Пример #1
0
        /// <summary>
        /// Opens the meeting search page in Chrome, searches the custom date range from
        /// <c>dtStartFrom</c> to today, and walks every result page. For each "ashx" document
        /// link it downloads the file when it is not already in <paramref name="docs"/>,
        /// reads its text, makes sure a matching entry exists in <paramref name="queries"/>,
        /// and passes both lists to <c>SaveMeetingResultsToSQL</c>.
        /// </summary>
        /// <param name="url">Address of the search page; relative document links are appended to it.</param>
        /// <param name="docs">Documents collected so far; newly found documents are added to this list.</param>
        /// <param name="queries">Query results collected so far; new results are added to this list.</param>
        public void ExractCouncil(string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            // "/" in a custom date format means "the culture's date separator", so the formats below
            // are pinned to the invariant culture to always produce and accept a literal slash.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            ChromeDriver cd = new ChromeDriver();

            try
            {
                using (WebClient c = new WebClient())
                {
                    cd.Navigate().GoToUrl(url);
                    System.Threading.Thread.Sleep(3000);
                    IWebElement   rangeEle       = cd.FindElementByXPath("//select[@class='agendasearch-input']");
                    SelectElement rangeSelectEle = new SelectElement(rangeEle);

                    rangeSelectEle.SelectByValue("cus");
                    System.Threading.Thread.Sleep(3000);
                    IWebElement dateStartEle = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radCalendarFrom_dateInput");
                    IWebElement dateEndEle   = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radCalendarTo_dateInput");

                    dateStartEle.Clear();
                    dateStartEle.SendKeys(this.dtStartFrom.ToString("M/d/yyyy", invariant));
                    dateEndEle.Clear();
                    dateEndEle.SendKeys(DateTime.Now.ToString("M/d/yyyy", invariant));
                    System.Threading.Thread.Sleep(2000);
                    IWebElement searchBtnEle = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_imageButtonSearch");

                    searchBtnEle.Click();
                    System.Threading.Thread.Sleep(2000);

                    // One iteration per result page; the loop ends when anything in the page handling throws.
                    while (true)
                    {
                        try
                        {
                            HtmlDocument doc = new HtmlDocument();
                            doc.LoadHtml(cd.PageSource);

                            HtmlNodeCollection rowList = doc.DocumentNode.SelectNodes("//table[@id='ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radGridMeetings_ctl00']/tbody/tr[contains(@class,'Row')]");
                            if (rowList != null)
                            {
                                foreach (HtmlNode rowNode in rowList)
                                {
                                    // The first cell of the row holds the meeting date.
                                    HtmlNode meetingDateNode = rowNode.SelectSingleNode("./td");
                                    string   meetingText     = meetingDateNode.InnerText;
                                    DateTime meetingDate     = DateTime.ParseExact(meetingText, "MM/dd/yy", invariant);

                                    if (meetingDate < this.dtStartFrom)
                                    {
                                        Console.WriteLine("Too early, skip...");
                                        continue;
                                    }

                                    HtmlNodeCollection docNodeList = rowNode.SelectNodes(".//a[contains(@href,'ashx')]");

                                    if (docNodeList == null)
                                    {
                                        continue;
                                    }

                                    foreach (HtmlNode docNode in docNodeList)
                                    {
                                        string docUrl = docNode.Attributes["href"].Value;
                                        docUrl = docUrl.StartsWith("http") ? docUrl : url.Trim('#') + docUrl;
                                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                                        if (localdoc == null)
                                        {
                                            string tag = docUrl.Contains("Minute") ? "Minute" : "Agenda";
                                            localdoc           = new Documents();
                                            localdoc.DocId     = Guid.NewGuid().ToString();
                                            localdoc.CityId    = this.cityEntity.CityId;
                                            localdoc.Checked   = false;
                                            localdoc.Important = false;
                                            localdoc.DocType   = "City Council";
                                            localdoc.DocSource = docUrl;
                                            string localFile = string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory, tag, meetingDate.ToString("yyyy-MM-dd"));
                                            localdoc.DocLocalPath = localFile;

                                            try
                                            {
                                                c.DownloadFile(docUrl, localFile);
                                            }
                                            catch (Exception downloadEx)
                                            {
                                                // Downloading stays best-effort (the document is still recorded),
                                                // but the failure is reported instead of being silently dropped.
                                                Console.WriteLine("Failed to download {0}: {1}", docUrl, downloadEx.Message);
                                            }

                                            docs.Add(localdoc);
                                        }
                                        else
                                        {
                                            Console.WriteLine("This file already downloaded...");
                                        }

                                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                        if (qr == null)
                                        {
                                            qr             = new QueryResult();
                                            qr.CityId      = localdoc.CityId;
                                            qr.DocId       = localdoc.DocId;
                                            qr.MeetingDate = meetingDate;
                                            qr.SearchTime  = DateTime.Now;

                                            queries.Add(qr);
                                        }

                                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                                        Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                                        this.SaveMeetingResultsToSQL(docs, queries);
                                    }
                                }
                            }

                            IWebElement nextPageBtnEle = cd.FindElementByXPath("//a[@title='Next Page']");
                            nextPageBtnEle.Click();
                            System.Threading.Thread.Sleep(3000);
                        }
                        catch (Exception ex)
                        {
                            // Any failure ends paging. Presumably the common case is a missing "Next Page" link;
                            // the exception text is printed so a parsing or driver error is not mistaken for it.
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine("Reach last page...");
                            Console.WriteLine("DEBUG EXCEPTION:{0}", ex.Message);
                            Console.ResetColor();
                            break;
                        }
                    }
                }
            }
            finally
            {
                // Always shut the browser down, including when navigation or an element lookup above throws.
                cd.Quit();
            }
        }
Пример #2
0
        /// <summary>
        /// Visits every "category*url" entry in <c>docUrls</c>, collects the PDF links found in table
        /// cells, works out each meeting date from the link, downloads PDFs that are not yet known,
        /// reads their text and stores the resulting documents and query results.
        /// </summary>
        /// <remarks>
        /// The meeting date is taken from the file URL when it contains a yyyy-MM-dd or MMddyy
        /// (optionally dash separated) pattern. Otherwise it is built from the link text plus a year
        /// derived from the first four-digit run in the file name. Links for which no date can be
        /// determined are reported and skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            Regex dateReg  = new Regex("(0|1)[0-9]{1}[0-9]{2}[0-9]{2}|(0|1)[0-9]{1}-[0-9]{2}-[0-9]{2}");
            Regex dateReg1 = new Regex("[0-9]{4}-[0-9]{2}-[0-9]{2}");
            Regex digitReg = new Regex("[0-9]{4}");

            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    // Each entry has the form "<category>*<listing url>".
                    string[]           urlParts    = url.Split('*');
                    string             category    = urlParts[0];
                    string             categoryUrl = urlParts[1];
                    HtmlDocument       doc         = web.Load(categoryUrl);
                    HtmlNodeCollection docNodes    = doc.DocumentNode.SelectNodes("//td//a[contains(@href,'.pdf')]");

                    if (docNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode docNode in docNodes)
                    {
                        Console.WriteLine("DEBUG:{0}...", categoryUrl);

                        string   fileUrl         = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;
                        string   meetingDateText = string.Empty;
                        DateTime meetingDate     = DateTime.MinValue;
                        if (dateReg1.IsMatch(fileUrl))
                        {
                            meetingDateText = dateReg1.Match(fileUrl).ToString();
                            Console.WriteLine("DEBUG: meeting date 1: {0}", meetingDateText);
                            meetingDate = DateTime.Parse(meetingDateText);
                        }
                        else if (dateReg.IsMatch(fileUrl))
                        {
                            meetingDateText = dateReg.Match(fileUrl).ToString();
                            meetingDateText = meetingDateText.Replace("-", string.Empty);
                            Console.WriteLine("DEBUG: meeting date 2: {0}", meetingDateText);
                            meetingDate = DateTime.ParseExact(meetingDateText, "MMddyy", null);
                        }
                        else
                        {
                            // Fall back to "<link text>, <year>", where the year comes from the last two
                            // digits of the first four-digit run in the file name.
                            Match yearMatch = digitReg.Match(fileUrl.Split('/').LastOrDefault());

                            if (!yearMatch.Success)
                            {
                                // Without this guard the Substring call below throws on an empty match.
                                Console.WriteLine("No meeting date found in {0}, skip...", fileUrl);
                                continue;
                            }

                            string year = yearMatch.Value;
                            year            = year.Substring(year.Length - 2, 2);
                            year            = (2000 + int.Parse(year)).ToString();
                            meetingDateText = string.Format("{0}, {1}", docNode.InnerText, year);
                            Console.WriteLine("DEBUG: meeting date 3: {0}", meetingDateText);

                            // The link text is free-form, so a non-date caption skips this link
                            // instead of aborting the whole crawl.
                            if (!DateTime.TryParse(meetingDateText, out meetingDate))
                            {
                                Console.WriteLine("Cannot parse meeting date '{0}', skip...", meetingDateText);
                                continue;
                            }
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                        if (localdoc == null)
                        {
                            localdoc              = new Documents();
                            localdoc.DocId        = Guid.NewGuid().ToString();
                            localdoc.CityId       = this.cityEntity.CityId;
                            localdoc.DocType      = category;
                            localdoc.DocSource    = fileUrl;
                            localdoc.Important    = false;
                            localdoc.Checked      = false;
                            localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());

                            try
                            {
                                c.DownloadFile(fileUrl, localdoc.DocLocalPath);
                            }
                            catch (Exception ex)
                            {
                                // Still best-effort (the document is recorded either way), but visible.
                                Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded...");
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.SearchTime  = DateTime.Now;
                            qr.MeetingDate = meetingDate;
                            qr.DocId       = localdoc.DocId;
                            qr.CityId      = localdoc.CityId;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries added...", docs.Count, queries.Count);
                        this.SaveMeetingResultsToSQL(docs, queries);
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Crawls each "category*url" entry in <c>docUrls</c> with Chrome. For every year from
        /// <c>dtStartFrom</c> to now it opens that year's listing, pages through the posts, saves the
        /// text of each post that is not yet known to a local .html file, and stores the resulting
        /// documents and query results after every page.
        /// </summary>
        /// <remarks>
        /// Categories without year links, duplicate post links on one page, and posts whose title
        /// contains no parsable date are skipped rather than aborting the crawl. The browser is
        /// closed even when the crawl fails.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            var          docs    = this.LoadDocumentsDoneSQL();
            var          queries = this.LoadQueriesDoneSQL();
            Regex        dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2}[\\s]{0,1}[0-9]{4}");
            ChromeDriver cd      = new ChromeDriver();

            try
            {
                foreach (string docUrl in this.docUrls)
                {
                    // Each entry has the form "<category>*<listing url>".
                    string[] urlParts    = docUrl.Split('*');
                    string   category    = urlParts[0];
                    string   categoryUrl = urlParts[1];
                    cd.Navigate().GoToUrl(categoryUrl);
                    System.Threading.Thread.Sleep(2000);
                    HtmlDocument yearListDoc = new HtmlDocument();
                    yearListDoc.LoadHtml(cd.PageSource);
                    HtmlNodeCollection yearNodeList = yearListDoc.DocumentNode.SelectNodes("//div[@class='media']//a[@class='page-list__item']");

                    if (yearNodeList == null)
                    {
                        // SelectNodes returned nothing; searching a null list would throw.
                        Console.WriteLine("No year links found on {0}, skip...", categoryUrl);
                        continue;
                    }

                    for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
                    {
                        Console.WriteLine("Working on year {0}...", i);
                        string yearText = i.ToString();
                        var    yearNode = yearNodeList.FirstOrDefault(t => t.InnerText.Contains(yearText));

                        if (yearNode == null)
                        {
                            continue;
                        }

                        string yearUrl = this.cityEntity.CityUrl + yearNode.Attributes["href"].Value;
                        cd.Navigate().GoToUrl(yearUrl);
                        System.Threading.Thread.Sleep(2000);
                        var fileList = cd.FindElementsByXPath("//a[@class='blog__post']");
                        Console.WriteLine("{0} nodes found...", fileList.Count);
                        int page = 1;
                        do
                        {
                            // Decide up front whether this is the last page: the loop body navigates
                            // away to the individual posts, after which the pager is no longer on screen.
                            bool breakNow      = false;
                            var  nextButtonEle = cd.FindElementByXPath("//a[@class='su_next']/parent::li");
                            if (nextButtonEle.GetAttribute("class") != "next")
                            {
                                Console.WriteLine("Reach last page...");
                                breakNow = true;
                            }

                            bool nothingUsable = fileList.All(t =>
                            {
                                string href = t.GetAttribute("href");
                                string text = t.Text;
                                return(string.IsNullOrEmpty(href) || string.IsNullOrEmpty(text));
                            });

                            if (nothingUsable)
                            {
                                Console.WriteLine("No doc...");
                                break;
                            }

                            // Snapshot href -> title now; the elements go stale once the driver navigates.
                            var fileUrlDic = new Dictionary <string, string>();
                            foreach (var ele in fileList)
                            {
                                string url = ele.GetAttribute("href");

                                // Keep the first title for a link listed twice; Add would throw on the repeat.
                                if (!string.IsNullOrEmpty(url) && !fileUrlDic.ContainsKey(url))
                                {
                                    fileUrlDic.Add(url, ele.Text);
                                }
                            }

                            foreach (KeyValuePair <string, string> entry in fileUrlDic)
                            {
                                string url             = entry.Key;
                                string title           = entry.Value;
                                string meetingDateText = dateReg.Match(title).ToString();
                                Console.WriteLine("DEBUG: {0} - {1}", meetingDateText, title);
                                DateTime meetingDate;

                                if (!DateTime.TryParse(meetingDateText, out meetingDate))
                                {
                                    // No date in the title (empty match) or an unparsable one.
                                    Console.WriteLine("Cannot parse meeting date '{0}', skip...", meetingDateText);
                                    continue;
                                }

                                if (meetingDate < this.dtStartFrom)
                                {
                                    Console.WriteLine("Too early, skip...");
                                    continue;
                                }

                                var localdoc = docs.FirstOrDefault(t => t.DocSource == url);

                                if (localdoc == null)
                                {
                                    localdoc           = new Documents();
                                    localdoc.DocId     = Guid.NewGuid().ToString();
                                    localdoc.CityId    = this.cityEntity.CityId;
                                    localdoc.Checked   = false;
                                    localdoc.Important = false;
                                    localdoc.DocType   = category;
                                    localdoc.DocSource = url;
                                    string localPath = string.Format("{0}\\{1}_{2}_{3}.html",
                                                                     this.localDirectory,
                                                                     category,
                                                                     meetingDate.ToString("yyyy-MM-dd"),
                                                                     localdoc.DocId);
                                    localdoc.DocLocalPath = localPath;
                                    localdoc.Readable     = true;
                                    docs.Add(localdoc);

                                    cd.Navigate().GoToUrl(url);
                                    System.Threading.Thread.Sleep(1000);
                                    var    targetEle = cd.FindElementByXPath("//div[@class='su_bootstrap_safe su-content-wrapper']");
                                    string js        = "document.documentElement.scrollTop=" + targetEle.Location.Y;
                                    ((IJavaScriptExecutor)cd).ExecuteScript(js);
                                    System.Threading.Thread.Sleep(1000);
                                    string meetingText = targetEle.Text;
                                    localdoc.DocBodyDic.Add(1, meetingText);
                                    File.WriteAllText(localdoc.DocLocalPath, meetingText);
                                }
                                else
                                {
                                    Console.WriteLine("This file already downloaded....");

                                    // Only load the cached text when the body has not been filled yet,
                                    // so a second Add under key 1 cannot throw.
                                    if (localdoc.DocBodyDic.Count == 0)
                                    {
                                        localdoc.DocBodyDic.Add(1, File.ReadAllText(localdoc.DocLocalPath));
                                    }
                                }

                                var qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.CityId      = this.cityEntity.CityId;
                                    qr.DocId       = localdoc.DocId;
                                    qr.SearchTime  = DateTime.Now;
                                    qr.MeetingDate = meetingDate;
                                    qr.QueryId     = Guid.NewGuid().ToString();
                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs saved, {1} queries save...", docs.Count, queries.Count);
                            }

                            this.SaveMeetingResultsToSQL(docs, queries);
                            page++;

                            if (breakNow)
                            {
                                break;
                            }

                            Console.WriteLine("Go to page {0}...", page);
                            string newPage = yearUrl + "?page=" + page;
                            cd.Navigate().GoToUrl(newPage);
                            System.Threading.Thread.Sleep(2000);
                            fileList = cd.FindElementsByXPath("//a[@class='blog__post']");
                        }while (fileList != null && fileList.Count > 0);
                    }
                }
            }
            finally
            {
                // Always shut the browser down, including when an element lookup or navigation throws.
                cd.Quit();
            }
        }
Пример #4
0
        /// <summary>
        /// Queries the meeting calendar at <c>docUrls[0]</c> one calendar year at a time, from the year
        /// of <c>dtStartFrom</c> through the current year. For every meeting row of a recognised body it
        /// downloads the Agenda, Minutes and Agenda Packet files that are not yet known, reads their
        /// text and stores the resulting documents and query results.
        /// </summary>
        /// <remarks>
        /// Rows with no document links, with an unrecognised meeting body, without a readable meeting
        /// date, or dated before <c>dtStartFrom</c> are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs       = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries    = this.LoadQueriesDoneSQL();
            HtmlWeb            web        = new HtmlWeb();
            string[]           linkLabels = { "Agenda", "Minutes", "Agenda Packet" };
            int                lastYear   = DateTime.Now.Year;

            // "/" in a custom date format means "the culture's date separator"; the query string
            // needs a literal slash, so the dates are formatted with the invariant culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (WebClient c = new WebClient())
            {
                for (int year = this.dtStartFrom.Year; year <= lastYear; year++)
                {
                    string rangeUrl = string.Format("{0}?From={1}&To={2}",
                                                    this.docUrls[0],
                                                    new DateTime(year, 1, 1).ToString("M/d/yyyy", invariant),
                                                    new DateTime(year, 12, 31).ToString("M/d/yyyy", invariant));

                    HtmlDocument       doc         = web.Load(rangeUrl);
                    HtmlNodeCollection recordNodes = doc.DocumentNode.SelectNodes("//div[contains(@class,'Row MeetingRow')]");

                    if (recordNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode recordNode in recordNodes)
                    {
                        // Collect the row's document links in a fixed order, keeping only links
                        // that actually carry a non-empty href.
                        List <HtmlNode> docNodes = new List <HtmlNode>();
                        foreach (string label in linkLabels)
                        {
                            HtmlNode linkNode = recordNode.SelectSingleNode(".//a[text()='" + label + "']");
                            if (linkNode != null && !string.IsNullOrEmpty(linkNode.GetAttributeValue("href", string.Empty)))
                            {
                                docNodes.Add(linkNode);
                            }
                        }

                        if (docNodes.Count == 0)
                        {
                            Console.WriteLine("No files found....");
                            continue;
                        }

                        HtmlNode meetingCategoryNode = recordNode.SelectSingleNode(".//div[@class='MainScreenText RowDetails']");
                        string   category            = meetingCategoryNode == null ? string.Empty : meetingCategoryNode.InnerText;

                        // Map the site's body names onto the document types used here; other bodies are ignored.
                        if (category.Contains("City Commission"))
                        {
                            category = "City Council";
                        }
                        else if (category.Contains("Board of Zoning Appeals"))
                        {
                            category = "Zoning Board of Appeals";
                        }
                        else if (category.Contains("Planning Commission"))
                        {
                            category = "Planning Commission";
                        }
                        else
                        {
                            continue;
                        }

                        HtmlNode dateNode        = recordNode.SelectSingleNode(".//div[@class='RowLink']/a");
                        string   meetingDateText = dateNode == null ? string.Empty : dateNode.InnerText;
                        DateTime meetingDate;

                        if (!DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            // A row without a date link, or with unparsable text, no longer aborts the crawl.
                            Console.WriteLine("Cannot parse meeting date '{0}', skip...", meetingDateText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        foreach (HtmlNode docNode in docNodes)
                        {
                            string docUrl = "http://grandrapidscitymi.iqm2.com/Citizens/" + docNode.Attributes["href"].Value.Replace("&amp;", "&");
                            docUrl = docUrl.Replace("FileView", "FileOpen");
                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.DocSource    = docUrl;
                                localdoc.Important    = false;
                                localdoc.Checked      = false;
                                localdoc.DocType      = category;
                                localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory,
                                                                      category, docNode.InnerText, Guid.NewGuid().ToString());

                                try
                                {
                                    // Assign rather than Add, so a header still present from an earlier
                                    // iteration is replaced instead of gaining a second value.
                                    c.Headers["user-agent"] = "Chrome";
                                    c.DownloadFile(docUrl, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Still best-effort (the document is recorded either way), but visible.
                                    Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.WriteLine("This file already downloaded...");
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.CityId      = this.cityEntity.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Reads the agenda table of an already loaded listing page. For each row with an agenda link
        /// it downloads the linked HTML page (or re-reads the local copy of a known document), stores
        /// the text of the "interiorcontenttext" element as the document body, makes sure a query
        /// result exists, and hands the agenda page to <c>ExtractMoreAgenda</c>.
        /// </summary>
        /// <param name="doc">Parsed listing page that contains the agenda table.</param>
        /// <param name="url">Address of the listing page; only used in the error log.</param>
        /// <param name="category">Document type assigned to newly created documents.</param>
        /// <param name="docs">Documents collected so far; newly found documents are added to this list.</param>
        /// <param name="queries">Query results collected so far; new results are added to this list.</param>
        /// <remarks>
        /// Any exception stops the row loop and is logged. The collected results are passed to
        /// <c>SaveMeetingResultsToSQL</c> in either case.
        /// </remarks>
        private void ExtractAgendas(HtmlDocument doc, string url, string category, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            try
            {
                HtmlNodeCollection agendasNodes = doc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr[@class]");

                if (agendasNodes != null)
                {
                    using (WebClient c = new WebClient())
                    {
                        foreach (HtmlNode agendaNode in agendasNodes)
                        {
                            HtmlNode meetingDateNode = agendaNode.SelectSingleNode("./td");

                            if (meetingDateNode == null)
                            {
                                // A row without data cells carries no meeting; skip it rather than abort.
                                continue;
                            }

                            // The first cell reads "<date> - <...>"; trim spaces and non-breaking spaces.
                            string   meetingDateText = meetingDateNode.InnerText.Split('-').FirstOrDefault().Trim((char)32, (char)160);
                            DateTime meetingDate     = string.IsNullOrEmpty(meetingDateText) ? DateTime.MinValue : DateTime.Parse(meetingDateText);
                            HtmlNode agendaDocNode   = agendaNode.SelectSingleNode(".//div[@class='html']/a");

                            string fileUrl = agendaDocNode == null ? string.Empty : agendaDocNode.Attributes["href"].Value;

                            if (string.IsNullOrEmpty(fileUrl))
                            {
                                continue;
                            }

                            fileUrl = fileUrl.ToLower().StartsWith("http") ? fileUrl : this.cityEntity.CityUrl + fileUrl;
                            Documents    localDoc      = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                            HtmlDocument agendaHtmlDoc = new HtmlDocument();

                            if (localDoc == null)
                            {
                                string html = c.DownloadString(fileUrl);
                                agendaHtmlDoc.LoadHtml(html);
                                string localFile = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), fileUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                                File.WriteAllText(localFile, html, Encoding.UTF8);

                                localDoc          = new Documents();
                                localDoc.CityId   = this.cityEntity.CityId;
                                localDoc.Checked  = false;
                                localDoc.DocType  = category;
                                localDoc.DocId    = Guid.NewGuid().ToString();
                                localDoc.Readable = true;

                                // Record where the document came from and where it was saved. Without these
                                // the DocSource lookup above can never find this document again, and the
                                // cached branch below would have no file path to read.
                                localDoc.DocSource    = fileUrl;
                                localDoc.DocLocalPath = localFile;

                                localDoc.DocBodyDic.Add(1, this.GetAgendaBodyText(agendaHtmlDoc));
                                docs.Add(localDoc);
                            }
                            else
                            {
                                string html = File.ReadAllText(localDoc.DocLocalPath);
                                agendaHtmlDoc.LoadHtml(html);
                                if (localDoc.DocBodyDic.Count == 0)
                                {
                                    // Use the agenda body parsed from the saved page, not the text of the
                                    // link that pointed to it.
                                    localDoc.DocBodyDic.Add(1, this.GetAgendaBodyText(agendaHtmlDoc));
                                }
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This document already downloaded...");
                                Console.ResetColor();
                            }

                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.DocId       = localDoc.DocId;
                                qr.CityId      = localDoc.CityId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;

                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localDoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.ExtractMoreAgenda(agendaHtmlDoc, category, meetingDate, ref docs, ref queries);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("DEBUG EXCEPTION:{0}", ex.ToString());
                Console.WriteLine("CURRENT URL: {0}", url);
                Console.ResetColor();
            }
            this.SaveMeetingResultsToSQL(docs, queries);
        }

        /// <summary>
        /// Returns the text of the "interiorcontenttext" element of an agenda page, or the text of the
        /// whole document when the page has no such element.
        /// </summary>
        private string GetAgendaBodyText(HtmlDocument agendaHtmlDoc)
        {
            HtmlNode agendaBodyNode = agendaHtmlDoc.GetElementbyId("interiorcontenttext");
            return agendaBodyNode == null ? agendaHtmlDoc.DocumentNode.InnerText : agendaBodyNode.InnerText;
        }
Пример #6
0
        /// <summary>
        /// Downloads (or reuses) the HTML agenda at <paramref name="url"/>, indexes its text for the
        /// configured search terms, saves the results, and then hands every linked ".pdf" attachment
        /// to <c>ExtractPdf</c>.
        /// </summary>
        /// <param name="web">Not used by this method; kept so existing callers keep compiling.</param>
        /// <param name="c">Client used to download the page.</param>
        /// <param name="meetingDate">Meeting date stored on a newly created query result.</param>
        /// <param name="category">Document category; stored as the document type and used in the file name.</param>
        /// <param name="url">Address of the HTML agenda; also the key used to detect already-known documents.</param>
        /// <param name="docs">Known documents; a new entry is appended when <paramref name="url"/> is not yet present.</param>
        /// <param name="queries">Known query results; a new entry is appended when the document has none.</param>
        public void ExtractHtml(HtmlWeb web, WebClient c, DateTime meetingDate, string category, string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == url);

            if (localdoc == null)
            {
                localdoc              = new Documents();
                localdoc.DocId        = Guid.NewGuid().ToString();
                localdoc.CityId       = this.cityEntity.CityId;
                localdoc.DocSource    = url;
                localdoc.DocType      = category;
                localdoc.Checked      = false;
                localdoc.Important    = false;
                localdoc.DocLocalPath = string.Format("{0}\\{1}_Agenda_{2}.html",
                                                      this.localDirectory,
                                                      category,
                                                      Guid.NewGuid().ToString());

                try
                {
                    string html = c.DownloadString(url);
                    File.WriteAllText(localdoc.DocLocalPath, html);
                }
                catch (Exception ex)
                {
                    // Downloading stays best-effort, but the failure is reported instead of vanishing.
                    Console.WriteLine("Failed to download {0}: {1}", url, ex.Message);
                }

                docs.Add(localdoc);
            }
            else
            {
                Console.WriteLine("This file already downloaded...");
            }

            // A failed download leaves no local copy; reading it would throw, so stop here instead.
            if (!File.Exists(localdoc.DocLocalPath))
            {
                Console.WriteLine("Local copy {0} is missing, skip...", localdoc.DocLocalPath);
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
            HtmlNodeCollection pdfFileNodes = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href,'ViewFile')]");

            // The whole page is treated as a single "page 1" body.
            localdoc.DocBodyDic.Clear();
            localdoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr             = new QueryResult();
                qr.CityId      = localdoc.CityId;
                qr.DocId       = localdoc.DocId;
                qr.SearchTime  = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            this.SaveMeetingResultsToSQL(docs, queries);

            if (pdfFileNodes == null)
            {
                return;
            }

            foreach (HtmlNode pdfNode in pdfFileNodes)
            {
                // Only links whose visible text names a PDF are followed.
                if (!pdfNode.InnerText.ToLower().Contains(".pdf"))
                {
                    continue;
                }

                string pdfUrl = this.cityEntity.CityUrl + pdfNode.Attributes["href"].Value;
                this.ExtractPdf(c, meetingDate, pdfUrl, category, pdfNode.InnerText, ref docs, ref queries);
            }
        }
        /// <summary>
        /// Downloads (or reuses) the HTML agenda at <paramref name="docUrl"/>, indexes its text for the
        /// configured search terms, saves the results, and then hands every linked "/ViewFile/" URL
        /// containing "pdf" to <c>ExtractADoc</c>.
        /// </summary>
        /// <param name="web">Loader used to fetch the agenda page.</param>
        /// <param name="docUrl">Address of the agenda; also the key used to detect already-known documents.</param>
        /// <param name="category">Document category; stored as the document type and used in the file name.</param>
        /// <param name="meetingDate">Meeting date used in the file name and on a newly created query result.</param>
        /// <param name="docs">Known documents; a new entry is appended when <paramref name="docUrl"/> is not yet present.</param>
        /// <param name="queries">Known query results; a new entry is appended when the document has none.</param>
        private void ExtractAgendas(HtmlWeb web, string docUrl, string category, DateTime meetingDate, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Documents    localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
            HtmlDocument doc      = new HtmlDocument();

            if (localdoc == null)
            {
                localdoc              = new Documents();
                localdoc.DocId        = Guid.NewGuid().ToString();
                localdoc.CityId       = this.cityEntity.CityId;
                localdoc.DocSource    = docUrl;
                localdoc.DocType      = category;
                localdoc.Important    = false;
                localdoc.Checked      = false;
                localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.html",
                                                      this.localDirectory,
                                                      category,
                                                      meetingDate.ToString("yyyy-MM-dd"),
                                                      Guid.NewGuid().ToString());

                try
                {
                    doc = web.Load(docUrl);
                    doc.Save(localdoc.DocLocalPath);
                }
                catch (Exception ex)
                {
                    // Downloading stays best-effort, but the failure is reported instead of vanishing.
                    Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                }

                docs.Add(localdoc);
            }
            else
            {
                Console.WriteLine("This document already downloaded...");
            }

            // A failed download leaves no local copy; reading it would throw, so stop here instead.
            if (!File.Exists(localdoc.DocLocalPath))
            {
                Console.WriteLine("Local copy {0} is missing, skip...", localdoc.DocLocalPath);
                return;
            }

            doc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));

            // Clear first: Add throws when the document was already indexed and still holds page 1.
            localdoc.DocBodyDic.Clear();
            localdoc.DocBodyDic.Add(1, doc.DocumentNode.InnerText);
            HtmlNodeCollection fileNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'/ViewFile/')]");
            QueryResult        qr        = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr             = new QueryResult();
                qr.DocId       = localdoc.DocId;
                qr.CityId      = this.cityEntity.CityId;
                qr.MeetingDate = meetingDate;
                qr.SearchTime  = DateTime.Now;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            this.SaveMeetingResultsToSQL(docs, queries);

            if (fileNodes == null)
            {
                return;
            }

            // WebClient is disposable; release it once all attachments have been handled.
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode fileNode in fileNodes)
                {
                    string fileUrl = this.cityEntity.CityUrl + fileNode.Attributes["href"].Value;

                    if (fileUrl.Contains("pdf"))
                    {
                        this.ExtractADoc(c, fileUrl, category, "pdf", meetingDate, ref docs, ref queries);
                    }
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Walks every paged category listing in <c>docUrls</c> (entries are "category*url"), stores the
        /// content of each post dated on or after <c>dtStartFrom</c> as a local HTML file, indexes it for
        /// the configured search terms, and saves the results after each listing page.
        /// </summary>
        /// <remarks>
        /// Paging stops at the first page that shows the 404 section or contains no post entries.
        /// Entries without a parsable date, a link, or a content block are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            var     docs    = this.LoadDocumentsDoneSQL();
            var     queries = this.LoadQueriesDoneSQL();
            HtmlWeb web     = new HtmlWeb();

            foreach (string url in this.docUrls)
            {
                string[] urlParts    = url.Split('*');
                string   category    = urlParts[0];
                string   categoryUrl = urlParts[1];

                for (int i = 1; ; i++)
                {
                    Console.WriteLine("Working on page {0}...", i);
                    string       categoryPagedUrl = i == 1 ? categoryUrl : string.Format("{0}&paged={1}", categoryUrl, i);
                    HtmlDocument listDoc          = web.Load(categoryPagedUrl);
                    HtmlNode     notFoundNode     = listDoc.DocumentNode.SelectSingleNode("//section[@class='error-404 not-found']");

                    if (notFoundNode != null)
                    {
                        break;
                    }

                    HtmlNodeCollection entryNodes = listDoc.DocumentNode.SelectNodes("//article[contains(@id,'post')]");

                    if (entryNodes == null)
                    {
                        // No entries and no 404 marker: without this exit the paging loop would never end.
                        break;
                    }

                    foreach (HtmlNode entryNode in entryNodes)
                    {
                        HtmlNode dateNode = entryNode.SelectSingleNode(".//time[contains(@class,'entry-date published')]");

                        if (dateNode == null)
                        {
                            Console.WriteLine("No date found, skip....");
                            continue;
                        }

                        DateTime meetingDate;

                        if (!DateTime.TryParse(dateNode.InnerText, out meetingDate))
                        {
                            Console.WriteLine("Unrecognized date '{0}', skip....", dateNode.InnerText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Early! Skip....");
                            continue;
                        }

                        // The post link is read from the element that wraps the <time> node.
                        HtmlNode meetingUrlNode = dateNode.ParentNode;
                        string   meetingUrl     = meetingUrlNode == null || meetingUrlNode.Attributes["href"] == null
                                                  ? string.Empty
                                                  : meetingUrlNode.Attributes["href"].Value;

                        if (string.IsNullOrEmpty(meetingUrl))
                        {
                            continue;
                        }

                        HtmlNode contentNode = entryNode.SelectSingleNode(".//div[@class='entry-content']");

                        if (contentNode == null)
                        {
                            Console.WriteLine("No content found for {0}, skip....", meetingUrl);
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == meetingUrl);

                        if (localdoc == null)
                        {
                            localdoc              = new Documents();
                            localdoc.DocId        = Guid.NewGuid().ToString();
                            localdoc.CityId       = this.cityEntity.CityId;
                            localdoc.DocType      = category;
                            localdoc.Checked      = false;
                            localdoc.Important    = false;
                            localdoc.Readable     = true;
                            localdoc.DocSource    = meetingUrl;
                            localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.html", this.localDirectory, category, Guid.NewGuid().ToString());
                            File.WriteAllText(localdoc.DocLocalPath, contentNode.OuterHtml);
                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded...");
                        }

                        // Clear first: Add throws when the same post is seen twice and page 1 is already present.
                        localdoc.DocBodyDic.Clear();
                        localdoc.DocBodyDic.Add(1, contentNode.InnerText);
                        QueryResult qr = queries.FirstOrDefault(q => q.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;
                            qr.QueryId     = Guid.NewGuid().ToString();
                            qr.DocId       = localdoc.DocId;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Scans every page of <paramref name="doc"/> for the configured search terms and records a
        /// context snippet (the matching word plus up to 11 words on either side) on
        /// <paramref name="qr"/> for each hit.
        /// </summary>
        /// <remarks>
        /// A word counts as a hit when it starts with the search term (case-insensitive via ToLower).
        /// Terms from <c>searchTermsDependency</c> are only recorded when the snippet also mentions one
        /// of the terms in <c>DependencyContextTerms</c>. Entries that end up without any
        /// snippet are removed from <paramref name="qr"/> again.
        /// </remarks>
        /// <param name="doc">Document whose <c>DocBodyDic</c> maps page number to page text.</param>
        /// <param name="qr">Query result that receives the keyword entries; the reference itself is not reassigned.</param>
        protected void ExtractQueriesFromDoc(Documents doc, ref QueryResult qr)
        {
            foreach (int page in doc.DocBodyDic.Keys)
            {
                // Split and lower-case once per page rather than once per search term.
                string   text      = doc.DocBodyDic[page] ?? string.Empty;
                string   lowerText = text.ToLower();
                string[] bodyWords = text.Split(' ');

                foreach (string searchterm in searchterms)
                {
                    QueryResult.KeywordEntry entry = FindOrAddKeywordEntry(qr, searchterm, page);

                    if (lowerText.Contains(searchterm.ToLower()))
                    {
                        List <int>    hits       = FindWordIndexesStartingWith(bodyWords, searchterm);
                        List <string> entryLines = entry.Contents;

                        foreach (int hit in hits)
                        {
                            int    wordCount;
                            string line = BuildContextLine(bodyWords, hit, out wordCount);

                            if (wordCount < 20)
                            {
                                Console.WriteLine("Please check!");
                            }

                            if (!entryLines.Exists(t => t.Contains(line)))
                            {
                                entryLines.Add(line);
                            }
                        }

                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine("search term {0} appears {1} times in document {2}...", searchterm, hits.Count, doc.DocLocalPath);
                        Console.ResetColor();
                        entry.Contents = entryLines;
                        RegisterContents(entry);
                    }

                    if (entry.Contents.Count == 0)
                    {
                        qr.Entries.Remove(entry);
                    }
                }

                foreach (string searchTermD in searchTermsDependency)
                {
                    QueryResult.KeywordEntry entry = FindOrAddKeywordEntry(qr, searchTermD, page);

                    Console.WriteLine("Search {0}...", searchTermD);
                    if (lowerText.Contains(searchTermD.ToLower()))
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("Dependency search term {0} found...", searchTermD);
                        Console.ResetColor();
                        List <int> hits = FindWordIndexesStartingWith(bodyWords, searchTermD);

                        if (hits.Count != 0)
                        {
                            List <string> entryLines = entry.Contents;

                            foreach (int hit in hits)
                            {
                                int    wordCount;
                                string line      = BuildContextLine(bodyWords, hit, out wordCount);
                                string lowerLine = line.ToLower();

                                // Keep the snippet only when it mentions a context term and is not
                                // already covered by a recorded snippet.
                                bool mentionsContext = Array.Exists(DependencyContextTerms, t => lowerLine.Contains(t));

                                if (mentionsContext && !entryLines.Exists(t => t.Contains(line)))
                                {
                                    entryLines.Add(line);
                                }
                            }

                            Console.ForegroundColor = ConsoleColor.Green;
                            Console.WriteLine("Search term {0} appears {1} times...", searchTermD, entryLines.Count);
                            Console.ResetColor();
                            entry.Contents = entryLines;
                            RegisterContents(entry);
                        }
                    }

                    if (entry.Contents.Count == 0)
                    {
                        qr.Entries.Remove(entry);
                    }
                }
            }
        }

        // Lower-cased terms; a dependency hit is only kept when its snippet contains one of them.
        private static readonly string[] DependencyContextTerms =
        {
            "marijuana", "marihuana", "cannabis", "dispensary", "dispensaries", "provisioning", "cultivat"
        };

        // Returns the entry for (keyword, page) on the result, creating and attaching it when absent.
        private static QueryResult.KeywordEntry FindOrAddKeywordEntry(QueryResult result, string keyword, int page)
        {
            QueryResult.KeywordEntry entry = result.Entries.FirstOrDefault(t => t.Keyword == keyword && t.PageNumber == page);

            if (entry == null)
            {
                entry            = new QueryResult.KeywordEntry();
                entry.Keyword    = keyword;
                entry.PageNumber = page;
                entry.GuidDic    = new Dictionary <string, string>();
                entry.CommentDic = new Dictionary <string, string>();
                result.Entries.Add(entry);
            }

            return entry;
        }

        // Returns the positions of all words that start with the term, ignoring case, in document order.
        private static List <int> FindWordIndexesStartingWith(string[] words, string term)
        {
            string     lowerTerm = term.ToLower();
            List <int> indexes   = new List <int>();

            for (int i = 0; i < words.Length; i++)
            {
                if (words[i].ToLower().StartsWith(lowerTerm))
                {
                    indexes.Add(i);
                }
            }

            return indexes;
        }

        // Joins the word at index with up to 11 neighbours on each side; line breaks inside words are flattened.
        private static string BuildContextLine(string[] words, int index, out int wordCount)
        {
            int           rangeStart = Math.Max(0, index - 11);
            int           rangeEnd   = Math.Min(words.Length - 1, index + 11);
            List <string> window     = new List <string>();

            for (int i = rangeStart; i <= rangeEnd; i++)
            {
                window.Add(words[i].Replace("\r", string.Empty).Replace("\n", " "));
            }

            wordCount = window.Count;
            return string.Join(" ", window);
        }

        // Makes sure every snippet of the entry has a GUID and an (initially empty) comment.
        private static void RegisterContents(QueryResult.KeywordEntry entry)
        {
            if (entry.GuidDic == null)
            {
                entry.GuidDic = new Dictionary <string, string>();
            }

            if (entry.CommentDic == null)
            {
                entry.CommentDic = new Dictionary <string, string>();
            }

            foreach (string content in entry.Contents)
            {
                if (!entry.GuidDic.ContainsKey(content))
                {
                    entry.GuidDic.Add(content, Guid.NewGuid().ToString());
                }

                if (!entry.CommentDic.ContainsKey(content))
                {
                    entry.CommentDic.Add(content, string.Empty);
                }
            }
        }
Пример #10
0
        /// <summary>
        /// For every listing in <c>docUrls</c> (entries are "category*url"), reads the rows of table
        /// "table2", downloads the agenda PDF and, when present, the minutes PDF of each meeting dated
        /// on or after <c>dtStartFrom</c>, indexes them for the configured search terms and saves the
        /// results after each document.
        /// </summary>
        /// <remarks>
        /// Rows without a date element, a parsable date, or an agenda link are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            Regex              dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

            // WebClient is disposable; release it once all listings have been processed.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[] urlParts = url.Split('*');
                    string   category = urlParts[0];
                    string   listUrl  = urlParts[1];

                    HtmlDocument       doc         = web.Load(listUrl);
                    HtmlNodeCollection docNodeList = doc.DocumentNode.SelectNodes("//table[@id='table2']/tbody/tr");

                    if (docNodeList == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode docNode in docNodeList)
                    {
                        HtmlNode dateNode = docNode.SelectSingleNode(".//strong");

                        if (dateNode == null)
                        {
                            Console.WriteLine("No date found, skip...");
                            continue;
                        }

                        string   meetingDateText = dateReg.Match(dateNode.InnerText).ToString();
                        DateTime meetingDate;

                        // An empty regex match or an unparsable date must not abort the whole run.
                        if (!DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            Console.WriteLine("Unrecognized date '{0}', skip...", dateNode.InnerText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        // The agenda link is read from the element that wraps the date text.
                        HtmlNode agendaNode = dateNode.ParentNode;
                        HtmlNode minuteNode = docNode.SelectSingleNode(".//a[contains(@href,'Minute')]");

                        if (agendaNode == null || agendaNode.Attributes["href"] == null)
                        {
                            Console.WriteLine("No agenda link found, skip...");
                            continue;
                        }

                        // Maps document URL to the tag used in the local file name.
                        Dictionary <string, string> urlDic = new Dictionary <string, string>();
                        urlDic.Add(this.cityEntity.CityUrl + agendaNode.Attributes["href"].Value, "agenda");

                        if (minuteNode != null)
                        {
                            string minuteUrl = this.cityEntity.CityUrl + minuteNode.Attributes["href"].Value;

                            // Add throws on a duplicate key, which happens when both links share one URL.
                            if (!urlDic.ContainsKey(minuteUrl))
                            {
                                urlDic.Add(minuteUrl, "minute");
                            }
                        }

                        foreach (string key in urlDic.Keys)
                        {
                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == key);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.Checked      = false;
                                localdoc.Important    = false;
                                localdoc.DocType      = category;
                                localdoc.DocSource    = key;
                                localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.pdf",
                                                                      this.localDirectory,
                                                                      urlDic[key],
                                                                      Guid.NewGuid().ToString());

                                try
                                {
                                    c.DownloadFile(key, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Downloading stays best-effort, but the failure is reported instead of vanishing.
                                    Console.WriteLine("Failed to download {0}: {1}", key, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.WriteLine("This file already downloaded...");
                            }

                            // NOTE(review): ReadText is defined elsewhere; presumably it fills localdoc.DocBodyDic - confirm.
                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.CityId      = localdoc.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #11
0
        /// <summary>
        /// For every listing in <c>docUrls</c> (entries are "category*url"), reads the rows of the
        /// "tablewithheadingresponsive" table, downloads each linked PDF of meetings dated on or after
        /// <c>dtStartFrom</c>, indexes it for the configured search terms and saves the results after
        /// each document.
        /// </summary>
        /// <remarks>
        /// Header rows, rows without a parsable M/D/YYYY date and links without an href are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            Regex              dateReg = new Regex("[0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4}");
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();

            // WebClient is disposable; release it once all listings have been processed.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[]           urlParts    = url.Split('*');
                    string             category    = urlParts[0];
                    string             categoryUrl = urlParts[1];
                    HtmlDocument       categoryDoc = web.Load(categoryUrl);
                    HtmlNodeCollection docNodeList = categoryDoc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr");

                    if (docNodeList == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode lineNode in docNodeList)
                    {
                        // Rows containing a <th> are table headers.
                        if (lineNode.SelectSingleNode("./th") != null)
                        {
                            continue;
                        }

                        string   meetingDateText = dateReg.Match(lineNode.InnerText).ToString();
                        DateTime meetingDate;

                        // An empty regex match or an unparsable date must not abort the whole run.
                        if (!DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            Console.WriteLine("Unrecognized date '{0}', skip...", meetingDateText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        HtmlNodeCollection docNodes = lineNode.SelectNodes(".//div[@class='pdf']/a");

                        if (docNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode docNode in docNodes)
                        {
                            if (docNode.Attributes["href"] == null)
                            {
                                Console.WriteLine("Link without href, skip...");
                                continue;
                            }

                            string docLink = docNode.Attributes["href"].Value;
                            docLink = docLink.StartsWith("http") ? docLink : this.cityEntity.CityUrl + docLink;
                            Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docLink);

                            if (localDoc == null)
                            {
                                localDoc           = new Documents();
                                localDoc.DocSource = docLink;
                                localDoc.DocId     = Guid.NewGuid().ToString();
                                localDoc.CityId    = this.cityEntity.CityId;
                                localDoc.Important = false;
                                localDoc.Checked   = false;
                                localDoc.DocType   = category;

                                // NOTE(review): every link not labelled exactly "Agenda" is tagged "minute", so two
                                // such links on the same date would share one file name - confirm rows never have both.
                                string tag       = docNode.InnerText == "Agenda" ? "agenda" : "minute";
                                string localFile = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, tag, meetingDate.ToString("yyyyMMdd"));
                                localDoc.DocLocalPath = localFile;

                                try
                                {
                                    c.DownloadFile(docLink, localFile);
                                }
                                catch (Exception ex)
                                {
                                    // Downloading stays best-effort, but the failure is reported instead of vanishing.
                                    Console.WriteLine("Failed to download {0}: {1}", docLink, ex.Message);
                                }

                                docs.Add(localDoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This file already downloaded...");
                                Console.ResetColor();
                            }

                            // NOTE(review): ReadText is defined elsewhere; presumably it fills localDoc.DocBodyDic - confirm.
                            this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.DocId       = localDoc.DocId;
                                qr.CityId      = localDoc.CityId;
                                qr.MeetingDate = meetingDate;
                                qr.SearchTime  = DateTime.Now;

                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localDoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #12
0
        /// <summary>
        /// For every listing in <c>docUrls</c> (entries are "category*url"), follows the per-year links
        /// from <c>dtStartFrom.Year</c> to the current year, downloads each attachment dated on or after
        /// <c>dtStartFrom</c>, indexes it for the configured search terms and saves the results after
        /// each document.
        /// </summary>
        /// <remarks>
        /// The meeting date is taken from the link text ("Month D, YYYY") or, failing that, from the
        /// URL ("Month_D_YYYY"). Attachments without a recognizable date are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs     = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries  = this.LoadQueriesDoneSQL();
            HtmlWeb            web      = new HtmlWeb();
            Regex              dateReg  = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]{0,2}[0-9]{4}");
            Regex              dateReg1 = new Regex("[a-zA-Z]+_[0-9]{1,2}_[0-9]{4}");

            // WebClient is disposable; release it once all listings have been processed.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[]           urlParts  = url.Split('*');
                    string             category  = urlParts[0];
                    string             yearUrl   = urlParts[1];
                    HtmlDocument       yearDoc   = web.Load(yearUrl);
                    HtmlNodeCollection yearNodes = yearDoc.DocumentNode.SelectNodes("//div[@class='box sectionIntro']//div[@class='body']//a");

                    if (yearNodes == null)
                    {
                        continue;
                    }

                    for (int year = this.dtStartFrom.Year; year <= DateTime.Now.Year; year++)
                    {
                        string          yearText         = year.ToString();
                        List <HtmlNode> currentYearNodes = yearNodes.Where(t => t.InnerText.Contains(yearText)).ToList();

                        foreach (HtmlNode currentYearNode in currentYearNodes)
                        {
                            if (currentYearNode.Attributes["href"] == null)
                            {
                                continue;
                            }

                            string             currentYearUrl = currentYearNode.Attributes["href"].Value;
                            HtmlDocument       currentYearDoc = web.Load(currentYearUrl.Replace("&amp;", "&"));
                            HtmlNodeCollection docNodes       = currentYearDoc.DocumentNode.SelectNodes("//div[@class='attachment']/a");

                            if (docNodes == null)
                            {
                                continue;
                            }

                            foreach (HtmlNode docNode in docNodes)
                            {
                                if (docNode.Attributes["href"] == null)
                                {
                                    continue;
                                }

                                string docUrl          = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;
                                string meetingDateText = dateReg.Match(docNode.InnerText).ToString();
                                meetingDateText = string.IsNullOrEmpty(meetingDateText) ? dateReg1.Match(docUrl).ToString() : meetingDateText;
                                Console.WriteLine("Url {0}\r\nUrl 1 {1}\r\nUrl 2 {2}\r\nDateTime {3}",
                                                  yearUrl,
                                                  currentYearUrl,
                                                  docUrl,
                                                  meetingDateText);

                                if (String.IsNullOrEmpty(meetingDateText))
                                {
                                    continue;
                                }

                                // The URL form uses underscores; the link-text form has none, so the replace
                                // is a no-op there. TryParse avoids aborting on look-alikes that are not dates.
                                DateTime meetingDate;

                                if (!DateTime.TryParse(meetingDateText.Replace("_", " "), out meetingDate))
                                {
                                    Console.WriteLine("Unrecognized date '{0}', skip...", meetingDateText);
                                    continue;
                                }

                                if (meetingDate < this.dtStartFrom)
                                {
                                    Console.WriteLine("Too early, skip...");
                                    continue;
                                }

                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                                if (localdoc == null)
                                {
                                    localdoc              = new Documents();
                                    localdoc.DocId        = Guid.NewGuid().ToString();
                                    localdoc.CityId       = this.cityEntity.CityId;
                                    localdoc.DocSource    = docUrl;
                                    localdoc.DocType      = category;
                                    localdoc.Checked      = false;
                                    localdoc.Important    = false;
                                    localdoc.DocLocalPath = string.Format("{0}\\{1}",
                                                                          this.localDirectory,
                                                                          docUrl.Split('/').LastOrDefault());

                                    try
                                    {
                                        c.DownloadFile(docUrl, localdoc.DocLocalPath);
                                    }
                                    catch (Exception ex)
                                    {
                                        // Downloading stays best-effort, but the failure is reported instead of vanishing.
                                        Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                                    }

                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.WriteLine("This file already downloaded...");
                                }

                                // NOTE(review): ReadText is defined elsewhere; presumably it fills localdoc.DocBodyDic - confirm.
                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.CityId      = localdoc.CityId;
                                    qr.DocId       = localdoc.DocId;
                                    qr.SearchTime  = DateTime.Now;
                                    qr.MeetingDate = meetingDate;
                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                                this.SaveMeetingResultsToSQL(docs, queries);
                            }
                        }
                    }
                }
            }
        }
Пример #13
0
        /// <summary>
        /// For every year from <c>dtStartFrom.Year</c> through the current year, loads each configured
        /// listing page, downloads the linked "ViewFile" documents that are not already in the loaded
        /// document list, reads their text and records one query result per document.
        /// </summary>
        /// <remarks>
        /// Each entry of <c>docUrls</c> is split on '*': the first part is used as the document category,
        /// the second part is a format string that receives the year. Results are persisted through
        /// <c>SaveMeetingResultsToSQL</c> after each listing row that contains file links.
        /// Rows whose date cannot be read are skipped instead of aborting the whole run.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            Regex              dateReg = new Regex("[A-Za-z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");
            HtmlWeb            web     = new HtmlWeb();
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();

            // WebClient is disposable; release it once all downloads are finished.
            using (WebClient c = new WebClient())
            {
                for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
                {
                    foreach (string docUrl in docUrls)
                    {
                        string[] urlParts = docUrl.Split('*');
                        string   category = urlParts[0];
                        string   apiUrl   = string.Format(urlParts[1], i);

                        HtmlDocument       listDoc     = web.Load(apiUrl);
                        HtmlNodeCollection recordNodes = listDoc.DocumentNode.SelectNodes("//table/tbody/tr[@class='catAgendaRow']");

                        if (recordNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode recordNode in recordNodes)
                        {
                            // A row without a <strong> node or without a recognizable date is skipped;
                            // previously either case threw and stopped the whole crawl.
                            HtmlNode dateNode = recordNode.SelectSingleNode(".//strong");
                            string   dateText = dateNode == null ? string.Empty : dateReg.Match(dateNode.InnerText).ToString();
                            DateTime meetingDate;

                            if (!DateTime.TryParse(dateText, out meetingDate))
                            {
                                Console.WriteLine("Cannot read a meeting date from the row, skip...");
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }

                            HtmlNodeCollection docUrlsNodes = recordNode.SelectNodes(".//a[contains(@href,'ViewFile')]");

                            if (docUrlsNodes == null)
                            {
                                continue;
                            }

                            // Absolute, de-duplicated file URLs; links containing "previous" are ignored.
                            List <string> docFileUrls = docUrlsNodes
                                                        .Select(t => this.cityEntity.CityUrl + t.Attributes["href"].Value)
                                                        .Where(t => t.ToLower().Contains("previous") == false)
                                                        .Distinct()
                                                        .ToList();

                            foreach (string fileUrl in docFileUrls)
                            {
                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                                if (localdoc == null)
                                {
                                    localdoc           = new Documents();
                                    localdoc.CityId    = this.cityEntity.CityId;
                                    localdoc.Checked   = false;
                                    localdoc.DocId     = Guid.NewGuid().ToString();
                                    localdoc.DocSource = fileUrl;
                                    localdoc.DocType   = category;

                                    string localFileName = string.Format("{0}\\{1}_{2}_{3}.pdf",
                                                                         this.localDirectory,
                                                                         category,
                                                                         meetingDate.ToString("yyyy-MM-dd"),
                                                                         fileUrl.ToLower().Contains("agenda") ? "agenda" : "minutes"
                                                                         );

                                    try
                                    {
                                        c.DownloadFile(fileUrl, localFileName);
                                    }
                                    catch (Exception ex)
                                    {
                                        // Best effort: the document is still registered, but the failure is now visible.
                                        Console.WriteLine("File {0} failed to download due to {1}...", fileUrl, ex.Message);
                                    }

                                    localdoc.DocLocalPath = localFileName;
                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.ForegroundColor = ConsoleColor.Yellow;
                                    Console.WriteLine("File already downloaded....");
                                    Console.ResetColor();
                                }

                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.CityId      = this.cityEntity.CityId;
                                    qr.DocId       = localdoc.DocId;
                                    qr.MeetingDate = meetingDate;
                                    qr.SearchTime  = DateTime.Now;

                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                            }

                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #14
0
        /// <summary>
        /// Processes a list of link nodes: resolves each href to an absolute URL, downloads documents that
        /// are not yet in <paramref name="docs"/>, reads their text and records a query result for each.
        /// </summary>
        /// <param name="category">Value stored as <c>DocType</c> on newly created documents.</param>
        /// <param name="nodeList">Anchor nodes; each is expected to carry an <c>href</c> attribute.</param>
        /// <param name="docs">Known documents; newly found documents are appended.</param>
        /// <param name="queries">Known query results; new results are appended.</param>
        private void ExtractDocsFromNodeList(string category, List <HtmlNode> nodeList, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            // WebClient is disposable; release it when the whole list has been processed.
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode docNode in nodeList)
                {
                    string href = docNode.Attributes["href"].Value;
                    string url  = href.StartsWith("http") ?
                                  href :
                                  "http://www.lincolnpark.govoffice.com" + href;
                    DateTime meetingDate = this.ExtractDate(url);

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Date:{0}, Earlier than {1}...", meetingDate.ToString("yyyy-MM-dd"), this.dtStartFrom.ToString("yyyy-MM-dd"));
                        continue;
                    }

                    Documents doc = docs.FirstOrDefault(t => t.DocSource.Contains(url));

                    if (doc == null)
                    {
                        Console.WriteLine("Found new document on {0}...", url);
                        doc           = new Documents();
                        doc.CityId    = this.cityEntity.CityId;
                        doc.DocId     = Guid.NewGuid().ToString();
                        doc.DocType   = category;
                        doc.DocSource = url;
                        docs.Add(doc);

                        // File name = last path segment of the URL with the query string removed.
                        string localFileName = doc.DocSource.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                        string localPath     = string.Format("{0}\\{1}", this.localDirectory, localFileName);

                        if (File.Exists(localPath))
                        {
                            // Name collision: insert the meeting date before the extension.
                            // Built from path components rather than string.Replace, which threw on an
                            // empty extension and also rewrote matches inside the directory part.
                            localPath = string.Format("{0}\\{1}_{2}{3}",
                                                      this.localDirectory,
                                                      Path.GetFileNameWithoutExtension(localPath),
                                                      meetingDate.ToString("yyyy-MM-dd"),
                                                      Path.GetExtension(localPath));
                        }

                        doc.DocLocalPath = localPath;

                        try
                        {
                            c.DownloadFile(doc.DocSource, localPath);
                        }
                        catch (Exception ex)
                        {
                            // Nothing to read for this document; report and move on to the next link.
                            Console.WriteLine("File {0} failed to download due to {1}...", doc.DocSource, ex.Message);
                            continue;
                        }
                    }

                    this.ReadText(false, doc.DocLocalPath, ref doc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == doc.DocId);

                    if (qr == null)
                    {
                        qr             = new QueryResult();
                        qr.CityId      = doc.CityId;
                        qr.DocId       = doc.DocId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime  = DateTime.Now;

                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(doc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                }
            }
        }
Пример #15
0
        /// <summary>
        /// Downloads each JSON listing in <paramref name="urls"/>, collects every <c>href</c> value in it,
        /// downloads the referenced files that are not yet known, reads their text and records a query
        /// result for each. Results are saved after every processed file.
        /// </summary>
        /// <param name="urls">Entries of the form "category*listUrl".</param>
        /// <param name="docs">Known documents; newly found documents are appended.</param>
        /// <param name="queries">Known query results; new results are appended.</param>
        /// <remarks>
        /// The meeting date is taken from the file URL itself. Links without a recognizable date are
        /// skipped instead of throwing, so one odd link no longer stops the remaining listings.
        /// </remarks>
        public void ExtractOthers(List <string> urls, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Regex dateReg = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]+[0-9]{4}");

            // WebClient is disposable; release it once every listing has been processed.
            using (WebClient c = new WebClient())
            {
                foreach (string url in urls)
                {
                    string[] urlParts = url.Split('*');
                    string   category = urlParts[0];
                    string   listUrl  = urlParts[1];
                    Console.WriteLine("Working on {0}...", listUrl);

                    string json    = c.DownloadString(listUrl);
                    var    jsonDoc = JsonConvert.DeserializeObject(json) as JToken;

                    if (jsonDoc == null)
                    {
                        continue;
                    }

                    var fileUrlsNodes = jsonDoc.SelectTokens("$..href");

                    if (fileUrlsNodes == null)
                    {
                        continue;
                    }

                    foreach (var fileUrlNode in fileUrlsNodes)
                    {
                        string   fileUrl   = "https://shelbytwpmi.documents-on-demand.com" + fileUrlNode.ToString();
                        Match    dateMatch = dateReg.Match(fileUrl);
                        DateTime meetingDate;

                        if (!dateMatch.Success || !DateTime.TryParse(dateMatch.Value, out meetingDate))
                        {
                            Console.WriteLine("No meeting date found in {0}, skip...", fileUrl);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                        if (localdoc == null)
                        {
                            localdoc           = new Documents();
                            localdoc.DocId     = Guid.NewGuid().ToString();
                            localdoc.CityId    = this.cityEntity.CityId;
                            localdoc.DocSource = fileUrl;
                            localdoc.Important = false;
                            localdoc.Checked   = false;
                            localdoc.DocType   = category;
                            string localPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());
                            localdoc.DocLocalPath = localPath;

                            try
                            {
                                c.DownloadFile(fileUrl, localPath);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: the document is still registered, but the failure is now visible.
                                Console.WriteLine("File {0} failed to download due to {1}...", fileUrl, ex.Message);
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded....");
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.DocId       = localdoc.DocId;
                            qr.CityId      = localdoc.CityId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                        this.SaveMeetingResultsToSQL(docs, queries);
                    }
                }
            }
        }
Пример #16
0
        /// <summary>
        /// Extracts page text from a document into <c>doc.DocBodyDic</c> (page number -> text).
        /// Paths whose lower-cased form contains "pdf" are read with iTextSharp; otherwise paths
        /// containing "doc" are opened through Word automation.
        /// </summary>
        /// <param name="pdfFile">Path of the file to read.</param>
        /// <param name="doc">Document whose <c>DocBodyDic</c> receives the text of each non-empty page.</param>
        /// <param name="pages">Set to the page count when the file is read as a PDF; untouched otherwise.</param>
        /// <returns><c>true</c> when extraction ran to completion; <c>false</c> on any failure or unsupported path.</returns>
        public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
        {
            bool success = false;

            try
            {
                if (pdfFile.ToLower().Contains("pdf"))
                {
                    PdfReader r = new PdfReader(pdfFile);

                    try
                    {
                        pages = r.NumberOfPages;
                        PdfReaderContentParser parser = new PdfReaderContentParser(r);

                        for (int i = 1; i <= pages; i++)
                        {
                            ITextExtractionStrategy st = parser.ProcessContent <SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                            string text = st.GetResultantText().Trim('\r', '\n', '\t', (char)32, (char)160);

                            if (string.IsNullOrEmpty(text))
                            {
                                // Second attempt with the default extractor when the simple strategy yields nothing.
                                text = PdfTextExtractor.GetTextFromPage(r, i).Trim('\r', '\n', '\t', (char)32, (char)160);
                            }

                            if (!string.IsNullOrEmpty(text))
                            {
                                doc.DocBodyDic.Add(i, text);
                            }
                        }
                    }
                    finally
                    {
                        // Close the reader even when extraction throws, so it is not left open.
                        r.Close();
                    }

                    success = true;
                }
                else if (pdfFile.ToLower().Contains("doc"))
                {
                    MsWord.Application newApp = null;
                    MsWord.Document    msdoc  = null;

                    try
                    {
                        int retry = 2;
                        while (retry > 0)
                        {
                            try
                            {
                                //newApp = (MsWord.Application)Marshal.GetActiveObject("Word.Application");
                                newApp = newApp == null ? new MsWord.Application() : newApp;
                                System.Threading.Thread.Sleep(1000);
                                //msdoc = newApp.ActiveDocument;
                                msdoc = newApp.Documents.Open(pdfFile);
                                System.Threading.Thread.Sleep(1000);
                                object             nothing = Missing.Value;
                                MsWord.WdStatistic stat    = MsWord.WdStatistic.wdStatisticPages;
                                int num = msdoc.ComputeStatistics(stat, ref nothing);

                                for (int i = 1; i <= num; i++)
                                {
                                    // Pages already captured (e.g. by a previous attempt) are not re-read.
                                    if (doc.DocBodyDic.ContainsKey(i))
                                    {
                                        continue;
                                    }

                                    object objWhat  = MsWord.WdGoToItem.wdGoToPage;
                                    object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;

                                    object       objPage = (object)i;
                                    MsWord.Range range1  = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                                    MsWord.Range range2  = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);

                                    object objStart = range1.Start;
                                    object objEnd   = range2.Start;
                                    if (range1.Start == range2.Start)
                                    {
                                        // Same start means there is no next page: read through the end of the document.
                                        objEnd = msdoc.Characters.Count;
                                    }

                                    Console.ForegroundColor = ConsoleColor.Red;
                                    Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                                    Console.ResetColor();

                                    if ((int)objStart <= (int)objEnd)
                                    {
                                        string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                                        doc.DocBodyDic.Add(i, innerText);
                                    }
                                }

                                success = true;
                                break;
                            }
                            catch (Exception ex)
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                                Console.ResetColor();
                                System.Threading.Thread.Sleep(1000);
                                retry--;
                            }
                            finally
                            {
                                try
                                {
                                    if (newApp != null)
                                    {
                                        newApp.NormalTemplate.Saved = true;

                                        if (msdoc != null)
                                        {
                                            msdoc.Close(false);
                                        }

                                        newApp.Quit();
                                    }
                                }
                                finally
                                {
                                    // The Word instance has been quit; drop the references so a retry
                                    // starts a fresh application instead of reusing the dead one.
                                    msdoc  = null;
                                    newApp = null;
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine("Failed to read word {0} due to {1}...", pdfFile, e.Message);
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed to read {0} due to {1}...", pdfFile, ex.Message);
            }

            return(success);
        }
Пример #17
0
        /// <summary>
        /// Loads each meeting listing page in <paramref name="urls"/>, finds the "Agenda" and
        /// "Agenda Packet" links of every meeting row, downloads the ones not yet known, reads their
        /// text and records a query result for each. Results are saved after every processed link.
        /// </summary>
        /// <param name="urls">Entries of the form "category*listUrl"; only the list URL part is used here.</param>
        /// <param name="docs">Known documents; newly found documents are appended.</param>
        /// <param name="queries">Known query results; new results are appended.</param>
        /// <remarks>
        /// New documents always get <c>DocType</c> "City Council". Rows whose date link is missing or
        /// unparsable are skipped instead of throwing.
        /// </remarks>
        public void ExtractCouncilAgenda(List <string> urls, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            HtmlWeb web = new HtmlWeb();

            // WebClient is disposable; release it once every listing has been processed.
            using (WebClient c = new WebClient())
            {
                foreach (string url in urls)
                {
                    string             listUrl     = url.Split('*')[1];
                    HtmlDocument       doc         = web.Load(listUrl);
                    HtmlNodeCollection recordNodes = doc.DocumentNode.SelectNodes("//div[contains(@class,'Row MeetingRow')]");

                    if (recordNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode recordNode in recordNodes)
                    {
                        List <HtmlNode> docNodes   = new List <HtmlNode>();
                        HtmlNode        agendaNode = recordNode.SelectSingleNode(".//a[text()='Agenda']");
                        if (agendaNode != null)
                        {
                            docNodes.Add(agendaNode);
                        }
                        HtmlNode agendePacketNode = recordNode.SelectSingleNode(".//a[text()='Agenda Packet']");
                        if (agendePacketNode != null)
                        {
                            docNodes.Add(agendePacketNode);
                        }

                        if (docNodes.Count == 0)
                        {
                            Console.WriteLine("No files found....");
                            continue;
                        }

                        // The row's link text carries the meeting date.
                        HtmlNode dateNode = recordNode.SelectSingleNode(".//div[@class='RowLink']/a");
                        DateTime meetingDate;

                        if (dateNode == null || !DateTime.TryParse(dateNode.InnerText, out meetingDate))
                        {
                            Console.WriteLine("Cannot read a meeting date from the row, skip...");
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        foreach (HtmlNode docNode in docNodes)
                        {
                            string    docUrl   = "http://shelbytownmi.iqm2.com/Citizens/" + docNode.Attributes["href"].Value.Replace("&amp;", "&");
                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.DocSource    = docUrl;
                                localdoc.Important    = false;
                                localdoc.Checked      = false;
                                localdoc.DocType      = "City Council";
                                localdoc.DocLocalPath = string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory,
                                                                      docNode.InnerText, meetingDate.ToString("yyyy-MM-dd"));

                                try
                                {
                                    c.Headers.Add("user-agent", "chrome");
                                    c.DownloadFile(docUrl, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Best effort: the document is still registered, but the failure is now visible.
                                    Console.WriteLine("File {0} failed to download due to {1}...", docUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.WriteLine("This file already downloaded...");
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.CityId      = this.cityEntity.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #18
0
        /// <summary>
        /// Runs OCR on every page of a PDF that has no entry in <c>doc.DocBodyDic</c> yet: each
        /// sufficiently large image XObject on the page is written to
        /// "&lt;localDirectory&gt;\&lt;file name&gt;\Page_&lt;n&gt;.jpg" and passed to <c>RetryText</c>; the first
        /// non-empty result is stored as the page text.
        /// </summary>
        /// <param name="rotate">Not used by this method.</param>
        /// <param name="docPath">Path of the PDF file.</param>
        /// <param name="doc">Document whose <c>DocBodyDic</c> receives the recognized page text.</param>
        /// <remarks>
        /// A single reader is shared by all pages and closed in a <c>finally</c>; the original opened a
        /// new reader per page and never closed it. A failure on one page is reported and does not stop
        /// the remaining pages.
        /// </remarks>
        public void OCRPdf(bool rotate, string docPath, ref Documents doc)
        {
            PdfReader pdfReader = new PdfReader(docPath);

            try
            {
                int totalPage = pdfReader.NumberOfPages;

                Console.WriteLine("Pdf file {0} contains {1} pages...", docPath, totalPage);
                List <int> pageNos = new List <int>();

                for (int i = 1; i <= totalPage; i++)
                {
                    if (!doc.DocBodyDic.ContainsKey(i))
                    {
                        pageNos.Add(i);
                    }
                }

                string ocrFolder = string.Format("{0}\\{1}", this.localDirectory, Path.GetFileNameWithoutExtension(docPath));

                foreach (int pageNumber in pageNos)
                {
                    try
                    {
                        Console.WriteLine("Working on page {0}...", pageNumber);
                        PdfDictionary pg   = pdfReader.GetPageN(pageNumber);
                        PdfDictionary res  = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                        PdfDictionary xobj = res == null ? null : (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));

                        if (xobj == null)
                        {
                            // No XObjects on this page, so there is no image to OCR.
                            continue;
                        }

                        foreach (PdfName name in xobj.Keys)
                        {
                            PdfObject obj = xobj.Get(name);

                            if (!obj.IsIndirect())
                            {
                                continue;
                            }

                            PdfDictionary tg        = (PdfDictionary)PdfReader.GetPdfObject(obj);
                            PdfObject     widthObj  = tg == null ? null : tg.Get(PdfName.WIDTH);
                            PdfObject     heightObj = tg == null ? null : tg.Get(PdfName.HEIGHT);

                            if (widthObj == null || heightObj == null)
                            {
                                // XObject without dimensions: skip it rather than abandoning the page.
                                continue;
                            }

                            float widthValue  = float.Parse(widthObj.ToString());
                            float heightValue = -1;
                            bool  isDigit     = float.TryParse(heightObj.ToString(), out heightValue);
                            heightValue = isDigit ? heightValue : widthValue;

                            // Images smaller than 100 in either dimension are ignored.
                            if (heightValue < 100 || widthValue < 100)
                            {
                                continue;
                            }

                            ImageRenderInfo imgRI         = ImageRenderInfo.CreateForXObject(new Matrix(widthValue, heightValue), (PRIndirectReference)obj, tg);
                            PdfImageObject  image         = imgRI.GetImage();
                            string          imageFileName = string.Format("{0}\\Page_{1}.jpg", ocrFolder, pageNumber);

                            using (Image dotnetImg = image.GetDrawingImage())
                            {
                                if (dotnetImg == null)
                                {
                                    continue;
                                }

                                // CreateDirectory is a no-op when the folder already exists.
                                Directory.CreateDirectory(ocrFolder);
                                dotnetImg.Save(imageFileName);
                            }

                            //string text = RunOCRCommand(imageFileName);
                            string text = RetryText(imageFileName);

                            if ((!doc.DocBodyDic.ContainsKey(pageNumber)) && (!string.IsNullOrEmpty(text)))
                            {
                                doc.DocBodyDic.Add(pageNumber, text);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("Page {0} could read...", pageNumber);
                                Console.ResetColor();
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Keep going with the next page, but make the failure visible.
                        Console.WriteLine("Page {0} of {1} failed OCR due to {2}...", pageNumber, docPath, ex.Message);
                    }
                }
            }
            finally
            {
                pdfReader.Close();
            }
        }
Пример #19
0
        public void DownloadCouncilPdfFiles()
        {
            WebClient          c       = new WebClient();
            HtmlWeb            web     = new HtmlWeb();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();

            Console.WriteLine("Will download pdf files from {0}...", this.dtStartFrom.ToString("yyyy-MM-dd"));

            foreach (string key in meetingUrlMap.Keys)
            {
                Console.WriteLine("Working on {0} files...", key);
                string             meetingUrl  = meetingUrlMap[key];
                HtmlDocument       doc         = web.Load(meetingUrl);
                HtmlNodeCollection docNodeList = doc.DocumentNode.SelectNodes("//a[text()='Agenda']/ancestor::p");

                if (docNodeList != null)
                {
                    foreach (HtmlNode docNode in docNodeList)
                    {
                        Regex  dateReg  = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]+,[\\s]{0,2}[0-9]+");
                        string dateText = dateReg.IsMatch(docNode.InnerText) ?
                                          dateReg.Match(docNode.InnerText).ToString() :
                                          string.Empty;

                        if (DateTime.Parse(dateText) < this.dtStartFrom)
                        {
                            Console.WriteLine("Earlier than {0}, skip...", dtStartFrom);
                            continue;
                        }

                        var targetLinkNodes = docNode.SelectNodes(".//a[@href]").Where(t =>
                                                                                       t.InnerText == "Agenda" || t.InnerText == "Minutes" || t.InnerText == "Approved Minutes");

                        if (targetLinkNodes != null && targetLinkNodes.Count() > 0)
                        {
                            foreach (HtmlNode docLinkNode in targetLinkNodes)
                            {
                                string docUrl = docLinkNode.Attributes["href"].Value;
                                docUrl = docUrl.StartsWith("http") ? docUrl : string.Format("http://www.cityofwarren.org{0}", docUrl);

                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource.Contains(docUrl));
                                if (localdoc == null)
                                {
                                    localdoc         = new Documents();
                                    localdoc.CityId  = cityEntity.CityId;
                                    localdoc.DocId   = Guid.NewGuid().ToString();
                                    localdoc.DocType = key;
                                    string localDocPath = string.Format("{0}\\{1}", localDirectory, docUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                                    localdoc.DocSource    = docUrl;
                                    localdoc.DocLocalPath = localDocPath;

                                    try
                                    {
                                        c.DownloadFile(docUrl, localDocPath);
                                    }
                                    catch (Exception ex)
                                    {
                                        Console.WriteLine("File {0} failed to download due to {1}...", docUrl, ex.ToString());
                                        continue;
                                    }

                                    docs.Add(localdoc);
                                }

                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                #region extract to another method ReadText()
                                //string content = string.Empty;
                                //bool readable = this.ReadPdf(localdoc.DocLocalPath, out content);

                                //if (readable == false || (readable = true && string.IsNullOrEmpty(content)))
                                //{
                                //    Console.ForegroundColor = ConsoleColor.Yellow;
                                //    Console.WriteLine("File {0} cannot read, OCR!", localdoc.DocLocalPath);
                                //    Console.ResetColor();
                                //    Dictionary<int, string> docBodyDic = new Dictionary<int, string>();
                                //    this.OCRPdf(localdoc.DocLocalPath, ref docBodyDic);
                                //    StringBuilder contentBuilder = new StringBuilder();

                                //    foreach (int page in docBodyDic.Keys)
                                //    {
                                //        contentBuilder.AppendFormat("{0} ", docBodyDic[page].ToString());
                                //    }

                                //    content = contentBuilder.ToString();
                                //}

                                //localdoc.DocBody = content;
                                //localdoc.Readable = readable;
                                #endregion
                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.QueryId     = Guid.NewGuid().ToString();
                                    qr.DocId       = localdoc.DocId;
                                    qr.SearchTime  = DateTime.Now;
                                    qr.MeetingDate = DateTime.Parse(dateText);
                                    qr.CityId      = this.cityEntity.CityId;

                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);

                                Console.WriteLine("{0} query results saved...", queries.Count);
                                Console.WriteLine("{0} docs saved...", docs.Count);
                            }
                        }
                    }
                }
            }

            queries.RemoveAll(t => t.Entries.Count == 0);
            this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #20
0
        /// <summary>
        /// Registers one meeting document: downloads it unless <paramref name="docs"/> already holds an
        /// entry for <paramref name="docUrl"/>, loads its text, makes sure a query result exists for it
        /// and runs <c>ExtractQueriesFromDoc</c> on it.
        /// </summary>
        /// <param name="c">Web client used for the download. It is not disposed here.</param>
        /// <param name="docUrl">Source URL of the document; also the key used to detect earlier downloads.</param>
        /// <param name="category">Document category. It is passed by reference through <c>DealWithFileName</c> before being used in the record and in the local file name.</param>
        /// <param name="fileType">
        /// File extension, optionally followed by ':' and an XPath. When the extension is "html", the XPath
        /// selects the node whose inner text becomes the document body.
        /// </param>
        /// <param name="meetingDate">Meeting date for a newly created query result.</param>
        /// <param name="docs">Known documents; a newly downloaded document is appended.</param>
        /// <param name="queries">Known query results; a new one is appended when none references the document.</param>
        /// <remarks>
        /// When the query result's meeting date is <see cref="DateTime.MinValue"/>, the date is searched for
        /// in the first entry of the document body. A failed download is logged and ends the call without
        /// registering anything.
        /// </remarks>
        protected virtual void ExtractADoc(WebClient c, string docUrl, string category, string fileType, DateTime meetingDate, ref List<Documents> docs, ref List<QueryResult> queries)
        {
            this.DealWithFileName(ref category);
            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

            // fileType is "<extension>" or "<extension>:<xpath>".
            string[] fileTypeParts = fileType.Split(':');
            string   xpath         = fileTypeParts.LastOrDefault();

            fileType = fileTypeParts.FirstOrDefault();

            if (localdoc == null)
            {
                localdoc              = new Documents();
                localdoc.DocSource    = docUrl;
                localdoc.CityId       = this.cityEntity.CityId;
                localdoc.DocId        = Guid.NewGuid().ToString();
                localdoc.DocType      = category;
                localdoc.Important    = false;
                localdoc.Checked      = false;
                localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.{4}",
                                                      this.localDirectory,
                                                      category,
                                                      meetingDate.ToString("yyyy-MM-dd"),
                                                      Guid.NewGuid().ToString(),
                                                      fileType);

                try
                {
                    // Assign rather than Add so repeated calls with the same client can never
                    // accumulate several user-agent values.
                    c.Headers["user-agent"] = "chrome";
                    c.DownloadFile(docUrl, localdoc.DocLocalPath);
                }
                catch (Exception ex)
                {
                    if (ex.ToString().Contains("404"))
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("NOT FOUND......");
                        Console.ResetColor();
                    }

                    Console.ForegroundColor = ConsoleColor.DarkCyan;
                    Console.WriteLine("Failed to download file {0}...", docUrl);
                    Console.WriteLine("ERROR: {0}", ex.ToString());
                    Console.ResetColor();
                    return;
                }

                docs.Add(localdoc);
            }
            else
            {
                Console.WriteLine("This file already downloaded...");
            }

            if (fileType != "html")
            {
                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
            }
            else
            {
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
                var bodyNode = doc.DocumentNode.SelectSingleNode(xpath);

                if (bodyNode == null)
                {
                    Console.WriteLine("No node matches '{0}' in {1}...", xpath, localdoc.DocLocalPath);
                }
                else
                {
                    // Indexer assignment: Add would throw if key 1 were already present.
                    localdoc.DocBodyDic[1] = bodyNode.InnerText;
                }
            }

            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr             = new QueryResult();
                qr.QueryId     = Guid.NewGuid().ToString();
                qr.CityId      = this.cityEntity.CityId;
                qr.DocId       = localdoc.DocId;
                qr.MeetingDate = meetingDate;
                qr.SearchTime  = DateTime.Now;
                queries.Add(qr);
            }

            if (qr.MeetingDate == DateTime.MinValue)
            {
                // Tried in order; only the last pattern accepts an ordinal day ("May 3rd, 2019").
                Regex[] datePatterns = new Regex[]
                {
                    new Regex("[a-zA-Z]{3,}[\\s]{1}[0-9]{1,2},[\\s]{1}[0-9]{4}"),
                    new Regex("[a-zA-Z]{3,}[\\s]{1}[0-9]{1,2}[\\s]{1}[0-9]{4}"),
                    new Regex("[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}"),
                    new Regex("[a-zA-Z]+[\\s]{1,2}[0-9]{1,2}(st|ST|nd|ND|rd|RD|th|TH)[,]{0,1}[\\s]{0,2}[0-9]{4}")
                };
                string bodyText = localdoc.DocBodyDic.Count > 0 ? localdoc.DocBodyDic.FirstOrDefault().Value : null;

                foreach (Regex dateReg in datePatterns)
                {
                    Console.WriteLine("Match meeting date again...");

                    if (string.IsNullOrEmpty(bodyText))
                    {
                        continue;
                    }

                    Console.WriteLine("DEBUG:{0}", bodyText);
                    Match dateMatch = dateReg.Match(bodyText);

                    if (!dateMatch.Success)
                    {
                        continue;
                    }

                    // Remove the ordinal suffix from the matched text only, and only directly after a
                    // digit. Stripping "th"/"st"/... from the whole body before matching would also
                    // mangle month names and leave nothing for the ordinal pattern to match.
                    string meetingDateText = Regex.Replace(dateMatch.Value, "(?<=[0-9])(st|nd|rd|th)", string.Empty, RegexOptions.IgnoreCase)
                                             .Replace("Sept ", "Sep ");
                    Console.WriteLine("DEBUG:{0}", meetingDateText);

                    DateTime detectedDate;

                    if (!DateTime.TryParse(meetingDateText.ToLower(), out detectedDate))
                    {
                        // A pattern can match text that is not a real date; try the next pattern.
                        Console.WriteLine("Cannot parse meeting date '{0}'...", meetingDateText);
                        continue;
                    }

                    Console.WriteLine("Match meeting date succeefully.");
                    meetingDate    = detectedDate;
                    qr.MeetingDate = detectedDate;
                    break;
                }
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            //  this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #21
0
        /// <summary>
        /// Walks every entry of <c>docUrls</c> (format "category*jsonUrl"), downloads the JSON listing,
        /// and for each "href" value found in it downloads the referenced document, loads its text and
        /// runs <c>ExtractQueriesFromDoc</c> on it.
        /// </summary>
        /// <remarks>
        /// The meeting date is taken from the document URL. Documents without a recognisable date keep
        /// <see cref="DateTime.MinValue"/> and are therefore skipped by the <c>dtStartFrom</c> check.
        /// Results are saved after every processed document.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List<Documents>   docs    = this.LoadDocumentsDoneSQL();
            List<QueryResult> queries = this.LoadQueriesDoneSQL();
            Regex dateReg  = new Regex("(0|1)[0-9]{7}");
            Regex dateReg1 = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[0-9]{4}");

            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    Console.WriteLine("Working on {0}...", url);

                    string[] urlParts = url.Split('*');

                    if (urlParts.Length < 2)
                    {
                        Console.WriteLine("Malformed entry {0}, skip...", url);
                        continue;
                    }

                    string category    = urlParts[0];
                    string categoryUrl = urlParts[1];
                    string dataJson    = c.DownloadString(categoryUrl);
                    JToken dataToken   = JsonConvert.DeserializeObject(dataJson) as JToken;

                    if (dataToken == null)
                    {
                        continue;
                    }

                    foreach (JToken docToken in dataToken.SelectTokens("$..href"))
                    {
                        string   docUrl      = "https://harrisontwpmi.documents-on-demand.com" + docToken.ToString();
                        DateTime meetingDate = DateTime.MinValue;

                        // Prefer an MMddyyyy digit run. If that run is not a real date, fall back to a
                        // "Month d,yyyy" fragment instead of throwing.
                        Match digitMatch = dateReg.Match(docUrl);
                        bool  dated      = digitMatch.Success &&
                                           DateTime.TryParseExact(digitMatch.Value,
                                                                  "MMddyyyy",
                                                                  System.Globalization.CultureInfo.InvariantCulture,
                                                                  System.Globalization.DateTimeStyles.None,
                                                                  out meetingDate);

                        if (!dated)
                        {
                            Match textMatch = dateReg1.Match(docUrl);

                            if (!textMatch.Success || !DateTime.TryParse(textMatch.Value, out meetingDate))
                            {
                                meetingDate = DateTime.MinValue;
                            }
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                        if (localdoc == null)
                        {
                            localdoc              = new Documents();
                            localdoc.DocId        = Guid.NewGuid().ToString();
                            localdoc.CityId       = this.cityEntity.CityId;
                            localdoc.DocType      = category;
                            localdoc.DocSource    = docUrl;
                            localdoc.Checked      = false;
                            localdoc.Important    = false;
                            localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, docUrl.Split('/').LastOrDefault());

                            try
                            {
                                c.DownloadFile(docUrl, localdoc.DocLocalPath);
                            }
                            catch (Exception ex)
                            {
                                // Do not register a document whose file was never written; leaving it
                                // out of docs means the URL is attempted again on a later run.
                                Console.WriteLine("Failed to download file {0}: {1}", docUrl, ex.Message);
                                continue;
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded...");
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            // NOTE(review): no QueryId is assigned here - confirm that QueryResult or the
                            // save step generates one.
                            qr             = new QueryResult();
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;
                            qr.CityId      = this.cityEntity.CityId;
                            qr.DocId       = localdoc.DocId;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                        this.SaveMeetingResultsToSQL(docs, queries);
                    }
                }
            }
        }
Пример #22
0
        /// <summary>
        /// Opens every category page listed in <c>docUrls</c> (format "category*pageUrl") in Chrome,
        /// reads the meeting links from the rendered table, downloads each meeting as PDF, loads its
        /// text and runs <c>ExtractQueriesFromDoc</c> on it.
        /// </summary>
        /// <remarks>
        /// Meetings dated before <c>dtStartFrom</c>, and links whose text holds no parsable date, are
        /// skipped. Results are saved after every processed document. The browser is always closed,
        /// even when processing fails part-way.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            // SecurityProtocolType values: TLS 1.2 = 3072, TLS 1.1 = 768.
            ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072;
            List<Documents>   docs    = this.LoadDocumentsDoneSQL();
            List<QueryResult> queries = this.LoadQueriesDoneSQL();
            Regex        dateReg = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]+[0-9]{4}");
            ChromeDriver cd      = new ChromeDriver();

            try
            {
                using (WebClient c = new WebClient())
                {
                    foreach (string docUrlEntry in this.docUrls)
                    {
                        string[] entryParts = docUrlEntry.Split('*');

                        if (entryParts.Length < 2)
                        {
                            Console.WriteLine("Malformed entry {0}, skip...", docUrlEntry);
                            continue;
                        }

                        string category    = entryParts[0];
                        string categoryUrl = entryParts[1] + "?limit=0";
                        cd.Navigate().GoToUrl(categoryUrl);
                        System.Threading.Thread.Sleep(3000);
                        HtmlDocument doc = new HtmlDocument();
                        doc.LoadHtml(cd.PageSource);
                        HtmlNodeCollection meetingNodes = doc.DocumentNode.SelectNodes("//td[@headers='tableOrdering']/a");

                        if (meetingNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode meetingNode in meetingNodes)
                        {
                            Match    dateMatch = dateReg.Match(meetingNode.InnerText);
                            DateTime meetingDate;

                            // A link without a date in its text used to abort the whole run with a
                            // FormatException; skip just that link instead.
                            if (!dateMatch.Success || !DateTime.TryParse(dateMatch.Value, out meetingDate))
                            {
                                Console.WriteLine("Cannot find meeting date in '{0}', skip...", meetingNode.InnerText);
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early...");
                                continue;
                            }

                            string    fileUrl  = this.cityEntity.CityUrl + meetingNode.Attributes["href"].Value + "?format=pdf";
                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.DocType      = category;
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.Important    = false;
                                localdoc.Checked      = false;
                                localdoc.DocSource    = fileUrl;
                                localdoc.DocLocalPath = string.Format("{0}\\{1}.pdf",
                                                                      this.localDirectory,
                                                                      fileUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());

                                try
                                {
                                    // Assign rather than Add so the header can never accumulate values
                                    // across iterations.
                                    c.Headers["user-agent"] = "chrome";
                                    c.DownloadFile(fileUrl, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Do not register a document whose file was never written.
                                    Console.WriteLine("Failed to download file {0}: {1}", fileUrl, ex.Message);
                                    continue;
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.WriteLine("This file already downloaded....");
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                // NOTE(review): no QueryId is assigned here - confirm that QueryResult
                                // or the save step generates one.
                                qr             = new QueryResult();
                                qr.CityId      = this.cityEntity.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.MeetingDate = meetingDate;
                                qr.SearchTime  = DateTime.Now;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
            finally
            {
                // Quit in finally so a failure above does not leave the browser running.
                cd.Quit();
                cd = null;
            }
        }
Пример #23
0
        /// <summary>
        /// Scans the meeting table of <paramref name="doc"/> and, for every "Minutes" PDF link in a row,
        /// downloads the file (unless already known), loads its text and runs
        /// <c>ExtractQueriesFromDoc</c> on it. Results are saved once at the end.
        /// </summary>
        /// <param name="doc">Parsed page containing the table with class "tablewithheadingresponsive".</param>
        /// <param name="category">Value stored as <c>DocType</c> on newly created documents.</param>
        /// <param name="docs">Known documents; newly downloaded ones are appended.</param>
        /// <param name="queries">Known query results; new ones are appended.</param>
        private void ExtractMinutes(HtmlDocument doc, string category, ref List<Documents> docs, ref List<QueryResult> queries)
        {
            HtmlNodeCollection docNodeCollection = doc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr[@class]");

            if (docNodeCollection != null)
            {
                using (WebClient c = new WebClient())
                {
                    foreach (HtmlNode destinationNode in docNodeCollection)
                    {
                        HtmlNodeCollection minuteDocNode = destinationNode.SelectNodes(".//div[@class='pdf']/a[text()='Minutes']");

                        if (minuteDocNode == null || minuteDocNode.Count == 0)
                        {
                            continue;
                        }

                        // The first cell holds the date, followed by '-' and further text. Trim both
                        // the regular space (32) and the non-breaking space (160).
                        HtmlNode meetingDateNode = destinationNode.SelectSingleNode("./td");
                        string   meetingDateText = meetingDateNode == null
                                                   ? string.Empty
                                                   : meetingDateNode.InnerText.Split('-').FirstOrDefault().Trim((char)32, (char)160);
                        DateTime meetingDate;

                        if (!DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            // An empty or unparsable cell is recorded as "no date" rather than
                            // aborting the scan.
                            meetingDate = DateTime.MinValue;

                            if (!string.IsNullOrEmpty(meetingDateText))
                            {
                                Console.WriteLine("Cannot parse meeting date '{0}'...", meetingDateText);
                            }
                        }

                        foreach (HtmlNode docNode in minuteDocNode)
                        {
                            string pdfUrl = docNode.Attributes["href"].Value;
                            pdfUrl = !pdfUrl.StartsWith("http") ? this.cityEntity.CityUrl + pdfUrl : pdfUrl;
                            Documents localDoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                            if (localDoc == null)
                            {
                                localDoc           = new Documents();
                                localDoc.DocId     = Guid.NewGuid().ToString();
                                localDoc.DocSource = pdfUrl;
                                localDoc.CityId    = this.cityEntity.CityId;
                                localDoc.DocType   = category;
                                string localFilePath = string.Format("{0}\\Minutes_{1}_{2}.pdf",
                                                                     this.localDirectory,
                                                                     Path.GetFileNameWithoutExtension(pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault()),
                                                                     Guid.NewGuid().ToString());
                                localDoc.DocLocalPath = localFilePath;
                                localDoc.Checked      = false;

                                try
                                {
                                    c.DownloadFile(pdfUrl, localFilePath);
                                }
                                catch (Exception ex)
                                {
                                    // Do not register a document whose file was never written; leaving
                                    // it out of docs means the URL is attempted again on a later run.
                                    Console.WriteLine("Failed to download file {0}: {1}", pdfUrl, ex.Message);
                                    continue;
                                }

                                docs.Add(localDoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This document already downloaded...");
                                Console.ResetColor();
                            }

                            this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                            if (qr == null)
                            {
                                // NOTE(review): no QueryId is assigned here - confirm that QueryResult
                                // or the save step generates one.
                                qr             = new QueryResult();
                                qr.CityId      = localDoc.CityId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;
                                qr.DocId       = localDoc.DocId;

                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localDoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                        }
                    }
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #24
0
        /// <summary>
        /// Loads <paramref name="url"/>, picks every link whose href contains "pdf" and whose text mentions
        /// a year between <c>dtStartFrom</c> and today, downloads each file (unless already known), loads
        /// its text and runs <c>ExtractQueriesFromDoc</c> on it. Results are saved after every document.
        /// </summary>
        /// <param name="url">Page listing the council documents.</param>
        /// <param name="docs">Known documents; newly downloaded ones are appended.</param>
        /// <param name="queries">Known query results; new ones are appended.</param>
        /// <remarks>
        /// The meeting date comes from the link text. When none is found there, the body entry with key 1
        /// is searched with the same pattern. Does nothing when the page has no matching links.
        /// </remarks>
        public void ExtractCouncil(string url, ref List<Documents> docs, ref List<QueryResult> queries)
        {
            HtmlWeb            web          = new HtmlWeb();
            Regex              dateReg      = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]+");
            HtmlDocument       doc          = web.Load(url);
            HtmlNodeCollection pdfFileNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'pdf')]");

            if (pdfFileNodes == null)
            {
                // SelectNodes returned nothing; iterating the unset target list used to throw here.
                Console.WriteLine("No pdf links found on {0}...", url);
                return;
            }

            List<string> years = new List<string>();

            for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
            {
                years.Add(i.ToString());
            }

            List<HtmlNode> targetFilesNodes = pdfFileNodes.Where(t => years.Exists(y => t.InnerText.Contains(y))).ToList();

            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode pdfFileNode in targetFilesNodes)
                {
                    Match    linkDateMatch = dateReg.Match(pdfFileNode.InnerText);
                    DateTime meetingDate;

                    // The pattern can match text that is not a real date; treat that as "no date"
                    // so the body fallback below still gets a chance.
                    if (!linkDateMatch.Success || !DateTime.TryParse(linkDateMatch.Value, out meetingDate))
                    {
                        meetingDate = DateTime.MinValue;
                    }

                    string pdfUrl = pdfFileNode.Attributes["href"].Value;
                    pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : this.cityEntity.CityUrl.Trim('/') + "/" + pdfUrl.TrimStart('/');
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                    if (localdoc == null)
                    {
                        localdoc              = new Documents();
                        localdoc.Important    = false;
                        localdoc.Checked      = false;
                        localdoc.CityId       = this.cityEntity.CityId;
                        localdoc.DocId        = Guid.NewGuid().ToString();
                        localdoc.DocSource    = pdfUrl;
                        localdoc.DocType      = "City Council";
                        // NOTE(review): the file name comes from the part AFTER the last '?' (then
                        // after the last '/'). Confirm this is intended for this site's URLs.
                        localdoc.DocLocalPath = string.Format("{0}\\City Council_{1}",
                                                              this.localDirectory,
                                                              pdfUrl.Split('?').LastOrDefault().Split('/').LastOrDefault());

                        try
                        {
                            c.DownloadFile(pdfUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Do not register a document whose file was never written; leaving it
                            // out of docs means the URL is attempted again on a later run.
                            Console.WriteLine("Failed to download file {0}: {1}", pdfUrl, ex.Message);
                            continue;
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded...");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    if (meetingDate == DateTime.MinValue)
                    {
                        string bodyText;

                        // TryGetValue: the body may be non-empty without having an entry under key 1.
                        if (localdoc.DocBodyDic.TryGetValue(1, out bodyText) && !string.IsNullOrEmpty(bodyText))
                        {
                            Match    bodyDateMatch = dateReg.Match(bodyText);
                            DateTime bodyDate;

                            if (bodyDateMatch.Success && DateTime.TryParse(bodyDateMatch.Value, out bodyDate))
                            {
                                meetingDate = bodyDate;
                            }
                        }
                    }

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        // NOTE(review): no QueryId is assigned here - confirm that QueryResult or the
                        // save step generates one.
                        qr             = new QueryResult();
                        qr.CityId      = localdoc.CityId;
                        qr.DocId       = localdoc.DocId;
                        qr.SearchTime  = DateTime.Now;
                        qr.MeetingDate = meetingDate;

                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Пример #25
0
        /// <summary>
        /// Finds every link that directly follows the small PDF icon in <paramref name="moreDoc"/>,
        /// downloads each file (unless already known), loads its text and runs
        /// <c>ExtractQueriesFromDoc</c> on it. Results are saved once at the end.
        /// </summary>
        /// <param name="moreDoc">Parsed page holding the agenda attachments.</param>
        /// <param name="category">Value stored as <c>DocType</c> on newly created documents.</param>
        /// <param name="meetingDate">Meeting date stored on newly created query results.</param>
        /// <param name="docs">Known documents; newly downloaded ones are appended.</param>
        /// <param name="queries">Known query results; new ones are appended.</param>
        private void ExtractMoreAgenda(HtmlDocument moreDoc, string category, DateTime meetingDate, ref List<Documents> docs, ref List<QueryResult> queries)
        {
            HtmlNodeCollection docNodeList = moreDoc.DocumentNode.SelectNodes("//img[@src='/Images/IconsClipArtGraphics/Icon-PDFSmall.aspx']/following-sibling::a");

            if (docNodeList != null && docNodeList.Count > 0)
            {
                using (WebClient c = new WebClient())
                {
                    foreach (HtmlNode docNode in docNodeList)
                    {
                        string pdfUrl = docNode.Attributes["href"].Value;
                        pdfUrl = !pdfUrl.StartsWith("http") ? this.cityEntity.CityUrl + pdfUrl : pdfUrl;
                        Documents localDoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                        if (localDoc == null)
                        {
                            localDoc           = new Documents();
                            localDoc.DocId     = Guid.NewGuid().ToString();
                            localDoc.DocSource = pdfUrl;
                            localDoc.CityId    = this.cityEntity.CityId;
                            localDoc.DocType   = category;
                            string localFilePath = string.Format("{0}\\Agenda_{1}_{2}.pdf",
                                                                 this.localDirectory,
                                                                 Path.GetFileNameWithoutExtension(pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault()),
                                                                 Guid.NewGuid().ToString());
                            localDoc.DocLocalPath = localFilePath;
                            localDoc.Checked      = false;

                            try
                            {
                                c.DownloadFile(pdfUrl, localFilePath);
                            }
                            catch (Exception ex)
                            {
                                // Do not register a document whose file was never written; leaving it
                                // out of docs means the URL is attempted again on a later run.
                                Console.WriteLine("Failed to download file {0}: {1}", pdfUrl, ex.Message);
                                continue;
                            }

                            docs.Add(localDoc);
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("The file already downloaded...");
                            Console.ResetColor();
                        }

                        this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                        if (qr == null)
                        {
                            // NOTE(review): no QueryId is assigned here - confirm that QueryResult or
                            // the save step generates one.
                            qr             = new QueryResult();
                            qr.DocId       = localDoc.DocId;
                            qr.CityId      = localDoc.CityId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;

                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localDoc, ref qr);
                        Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    }
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #26
0
        public void DownloadZBAFiles(ref List <Documents> docs, ref List <QueryResult> queries)
        {
            //List<string> zbaUrls = this.GetZBAMeetingsThisMonth();
            HtmlWeb   web         = new HtmlWeb();
            WebClient c           = new WebClient();
            string    zbaUrl      = this.docUrls.FirstOrDefault(t => t.StartsWith("Zoning Board of Appeals"));
            string    zbaTemplate = zbaUrl.Split('*')[1];
            string    earliestUrl = string.Empty;
            string    latestUrl   = string.Empty;

            this.GetEarliestMeeting(this.zbaCalendarUrl, ref earliestUrl, ref latestUrl);
            Regex digitReg = new Regex("[0-9]+");
            int   early    = int.Parse(digitReg.Matches(earliestUrl).Cast <Match>().LastOrDefault().ToString());
            int   end      = int.Parse(digitReg.Matches(latestUrl).Cast <Match>().LastOrDefault().ToString());

            if (string.IsNullOrEmpty(latestUrl))
            {
                Console.WriteLine("No more meetings...");
                return;
            }

            for (int i = early; ; i++)
            {
                if (this.listNoFiles.Contains(i))
                {
                    continue;
                }

                string       zba = string.Format(zbaTemplate, i);
                HtmlDocument doc = web.Load(zba);

                HtmlNode dateNode = doc.DocumentNode.SelectSingleNode("//*[text()='Start Date/Time:']/parent::div");
                dateNode = dateNode == null ? null : dateNode.NextSibling.NextSibling;

                if (dateNode == null && i > end)
                {
                    Console.WriteLine("The last meeting...");
                    break;
                }

                if (dateNode == null)
                {
                    this.listNoFiles.Add(i);
                    Console.WriteLine("No meetings...");
                    continue;
                }

                DateTime dtMeeting = DateTime.Parse(dateNode.InnerText);

                var             docNodesCollection = doc.DocumentNode.SelectNodes("//a[@href]");
                List <HtmlNode> fileLinksNodes     = docNodesCollection != null?
                                                     docNodesCollection.Where(t =>
                                                                              t.Attributes["href"].Value.ToLower().Contains(".pdf") ||
                                                                              t.Attributes["href"].Value.ToLower().Contains(".doc") ||
                                                                              t.Attributes["href"].Value.ToLower().Contains("fileticket"))
                                                     .ToList() : null;

                if (fileLinksNodes != null && fileLinksNodes.Count > 0)
                {
                    foreach (HtmlNode fileNode in fileLinksNodes)
                    {
                        string pdfUrl = fileNode.Attributes["href"].Value;
                        pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.detroitmi.gov" + fileNode.Attributes["href"].Value;
                        Documents localDoc = docs.FirstOrDefault(t => t.DocSource.Contains(pdfUrl));

                        if (localDoc == null)
                        {
                            localDoc       = new Documents();
                            localDoc.DocId = Guid.NewGuid().ToString();
                            HtmlNode categoryNode = doc.DocumentNode.SelectSingleNode("//*[text()='Category:']/parent::div");
                            categoryNode = categoryNode != null ? categoryNode.NextSibling.NextSibling : categoryNode;

                            if (categoryNode == null || categoryNode.InnerText.ToLower().Contains("zoning ") == false)
                            {
                                continue;
                            }

                            localDoc.DocType = categoryNode == null ? string.Empty : categoryNode.InnerText.Trim('\t', '\n', '\r');
                            string localFile = pdfUrl.ToLower().Contains("fileticket") ?
                                               pdfUrl.Split('&').FirstOrDefault().Split('=').LastOrDefault().Replace("%", string.Empty) :
                                               pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                            string localFileFull = pdfUrl.ToLower().Contains("fileticket") ?
                                                   string.Format("{0}\\Zoning_{1}_{2}.pdf", this.localDirectory, localFile, dtMeeting.ToString("yyyy-MM-dd")) :
                                                   string.Format("{0}\\{1}", localDirectory, localFile);
                            localDoc.DocLocalPath = localFileFull;
                            localDoc.CityId       = this.cityEntity.CityId;
                            localDoc.DocSource    = pdfUrl;

                            try
                            {
                                c.DownloadFile(pdfUrl, localFileFull);
                            }
                            catch (Exception)
                            {
                                continue;
                            }

                            docs.Add(localDoc);
                        }

                        this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("Now search in doc {0}.", localDoc.DocLocalPath);
                        Console.ResetColor();
                        QueryResult qr = queries.FirstOrDefault(q => q.DocId == localDoc.DocId);

                        if (qr == null)
                        {
                            qr = this.ExtractMeetingInformation(doc, localDoc);
                            queries.Add(qr);
                        }
                        else
                        {
                            this.ExtractQueriesFromDoc(localDoc, ref qr);
                        }

                        Console.WriteLine("{0} queries added, {1} docs added...", queries.Count, docs.Count);
                    }
                }
                else
                {
                    this.listNoFiles.Add(i);
                    File.WriteAllLines(string.Format("{0}_NoFiles.txt", this.GetType().Name), listNoFiles.Select(t => t.ToString()));
                    Console.WriteLine("No files on {0}...", zba);
                }

                largest = i;
            }
        }
Пример #27
0
        /// <summary>
        /// Walks every entry of <c>docUrls</c>, downloads the meeting documents it points to,
        /// reads their text, records query results and saves the accumulated lists after each entry.
        /// </summary>
        /// <remarks>
        /// An entry containing <c>*</c> is read as <c>date*documentUrl</c> and yields a single "Council"
        /// document dated by the prefix. Any other entry is loaded as a listing page whose
        /// <c>ul.linklist</c> anchors are processed; its category is "Council" when the URL contains
        /// "council" and "Planning Commission" otherwise. Links dated before <c>dtStartFrom</c>,
        /// links without an <c>href</c> and links whose date cannot be read are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            Regex              dateReg = new Regex("[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{2}");

            // WebClient is IDisposable; the using block releases it once every URL has been handled.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string date     = string.Empty;
                    string category = "Council";

                    if (url.Contains("*"))
                    {
                        // "date*documentUrl" entries carry their own meeting date.
                        date = url.Split('*')[0];
                    }
                    else if (!url.Contains("council"))
                    {
                        category = "Planning Commission";
                    }

                    List <HtmlNode> docNodeList = null;

                    if (string.IsNullOrEmpty(date))
                    {
                        HtmlDocument listDoc   = web.Load(url);
                        var          linkNodes = listDoc.DocumentNode.SelectNodes("//ul[@class='linklist ']/li/a");

                        // SelectNodes yields null (not an empty collection) when nothing matches,
                        // so it must be checked before calling ToList().
                        if (linkNodes != null)
                        {
                            docNodeList = linkNodes.ToList();
                        }
                    }
                    else
                    {
                        // Wrap the single explicit document in a synthetic anchor so that both
                        // kinds of entry go through the same loop below.
                        docNodeList = new List <HtmlNode>();
                        docNodeList.Add(HtmlNode.CreateNode(string.Format("<a href='{0}'>{1}</a>", url.Split('*')[1], date)));
                    }

                    if (docNodeList == null)
                    {
                        Console.WriteLine("No document links found on {0}...", url);
                    }
                    else
                    {
                        Console.WriteLine("{0} files...", docNodeList.Count);

                        foreach (HtmlNode docNode in docNodeList)
                        {
                            // The first three space-separated words of the link text are treated as the date.
                            string   dateText = string.Join(" ", docNode.InnerText.Trim('\t', '\r', '\n', (char)32, (char)160).Split(' ').Take(3));
                            DateTime meetingDate;

                            if (!DateTime.TryParse(string.IsNullOrEmpty(date) ? dateText : date, out meetingDate))
                            {
                                // Fall back to a dotted "M.d.yy" date embedded in the link text.
                                try
                                {
                                    meetingDate = DateTime.ParseExact(dateReg.Match(dateText).ToString(), "M.d.yy", null);
                                }
                                catch (FormatException)
                                {
                                    // One undated link must not abort the whole run.
                                    Console.WriteLine("Cannot read a meeting date from '{0}'. Skip...", dateText);
                                    continue;
                                }
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                                continue;
                            }

                            var hrefAttribute = docNode.Attributes["href"];

                            if (hrefAttribute == null)
                            {
                                Console.WriteLine("Link '{0}' has no href. Skip...", dateText);
                                continue;
                            }

                            string pdfUrl = hrefAttribute.Value;
                            pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.algonac-mi.gov" + pdfUrl;
                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.DocType      = category;
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.DocSource    = pdfUrl;
                                localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, meetingDate.ToString("yyyy-MM-dd"), Guid.NewGuid().ToString());

                                try
                                {
                                    c.DownloadFile(pdfUrl, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Best effort, as before: the document record is still kept,
                                    // but the failure is now reported instead of being hidden.
                                    Console.WriteLine("Download of {0} failed: {1}", pdfUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This document already downloaded...");
                                Console.ResetColor();
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.DocId       = localdoc.DocId;
                                qr.CityId      = localdoc.CityId;
                                qr.MeetingDate = meetingDate;
                                qr.SearchTime  = DateTime.Now;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} documents saved...", docs.Count);
                            Console.WriteLine("{0} query results saved...", queries.Count);
                        }
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Пример #28
0
        /// <summary>
        /// Runs the planning commission and ZBA extraction, then walks the numbered City Council
        /// meeting pages, downloading every linked .pdf/.doc/fileticket document and recording
        /// query results for it.
        /// </summary>
        /// <remarks>
        /// The page range runs from the number stored after the <c>"</c> in the "City Council" entry of
        /// <c>docUrls</c> up to <c>largest</c>, which is raised to the last number found in the latest
        /// meeting URL. Page numbers that produced no file links are remembered in
        /// <c>{TypeName}_NoFiles.txt</c> and skipped on later runs.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            HtmlWeb            web         = new HtmlWeb();
            List <Documents>   docs        = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries     = this.LoadQueriesDoneSQL();
            string             noFilesPath = string.Format("{0}_NoFiles.txt", this.GetType().Name);

            this.listNoFiles = File.Exists(noFilesPath) ?
                               File.ReadAllLines(noFilesPath).Select(t => int.Parse(t)).ToList() :
                               new List <int>();

            this.ExtractPlanningCommissionFiles(ref docs, ref queries);
            this.DownloadZBAFiles(ref docs, ref queries);
            this.SaveMeetingResultsToSQL(docs, queries);

            string earlistUrl = string.Empty;
            string latestUrl  = string.Empty;

            this.GetEarliestMeeting(this.cityCouncilUrl, ref earlistUrl, ref latestUrl);
            string councilTemplate = this.docUrls.FirstOrDefault(t => t.StartsWith("City Council"));

            if (councilTemplate == null)
            {
                // Without a template there is nothing to iterate; the results gathered so far
                // were already saved above.
                Console.WriteLine("No City Council url template configured...");
                return;
            }

            // Entry layout used below: <label>*<url template>"<first page number>
            string template = councilTemplate.Split('"').FirstOrDefault().Split('*').LastOrDefault();
            Regex  digitReg = new Regex("[0-9]+");

            // The last run of digits in the latest meeting URL is taken as its page number.
            Match lastNumber = digitReg.Matches(latestUrl ?? string.Empty).Cast <Match>().LastOrDefault();

            if (lastNumber != null)
            {
                int end = int.Parse(lastNumber.Value);

                if (end > this.largest)
                {
                    this.largest = end;
                }
            }
            else
            {
                // No number could be read from the latest URL: keep the current upper bound
                // instead of failing on a null match.
                Console.WriteLine("No meeting number found in latest url, keep {0}...", this.largest);
            }

            listNoFiles.RemoveAll(t => t > this.largest);
            int start = int.Parse(councilTemplate.Split('"').LastOrDefault());

            Console.WriteLine("Start from {0}...", start);
            Console.WriteLine("End at {0}...", this.largest);

            // WebClient is IDisposable; release it when the page loop is done.
            using (WebClient c = new WebClient())
            {
                for (int i = start; i <= this.largest; i++)
                {
                    if (listNoFiles.Contains(i))
                    {
                        continue;
                    }

                    string             councilUrl     = template.Contains("{0}") ? string.Format(template, i) : template;
                    HtmlDocument       doc            = web.Load(councilUrl);
                    HtmlNodeCollection linksNodes     = doc.DocumentNode.SelectNodes("//a[@href]");
                    List <HtmlNode>    fileLinksNodes = linksNodes == null ? null :
                                                        linksNodes.Where(t =>
                                                        {
                                                            string href = t.Attributes["href"].Value.ToLower();
                                                            return href.Contains(".pdf") || href.Contains(".doc") || href.Contains("fileticket");
                                                        })
                                                        .ToList();

                    // The value sits two siblings after the label's parent div; either sibling may be missing.
                    HtmlNode dateNode = doc.DocumentNode.SelectSingleNode("//*[text()='Start Date/Time:']/parent::div");
                    dateNode = dateNode == null || dateNode.NextSibling == null ? null : dateNode.NextSibling.NextSibling;

                    string   date = string.Empty;
                    DateTime meetingDate;

                    // An unreadable date is treated like a missing one rather than aborting the run.
                    if (dateNode != null && DateTime.TryParse(dateNode.InnerText, out meetingDate))
                    {
                        date = meetingDate.ToString("yyyy-MM-dd");
                    }

                    if (fileLinksNodes != null && fileLinksNodes.Count > 0)
                    {
                        // The category belongs to the page, not to a single link, so look it up once.
                        HtmlNode categoryNode = doc.DocumentNode.SelectSingleNode("//*[text()='Category:']/parent::div");
                        categoryNode = categoryNode == null || categoryNode.NextSibling == null ? null : categoryNode.NextSibling.NextSibling;

                        foreach (HtmlNode fileNode in fileLinksNodes)
                        {
                            string pdfUrl = fileNode.Attributes["href"].Value;
                            pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.detroitmi.gov" + pdfUrl;
                            Documents localDoc = docs.FirstOrDefault(t => t.DocSource.Contains(pdfUrl));

                            if (localDoc == null)
                            {
                                if (categoryNode == null)
                                {
                                    // Pages without a category value are not recorded.
                                    continue;
                                }

                                bool isFileTicket = pdfUrl.ToLower().Contains("fileticket");

                                // fileticket links have no file name, so the ticket value (first query
                                // argument, '%' stripped) is used; otherwise the last path segment.
                                string localFile = isFileTicket ?
                                                   pdfUrl.Split('&').FirstOrDefault().Split('=').LastOrDefault().Replace("%", string.Empty) :
                                                   pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                                string localFileFull = isFileTicket ?
                                                       string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory, localFile, date) :
                                                       string.Format("{0}\\{1}", localDirectory, localFile);

                                if (File.Exists(localFileFull))
                                {
                                    // Avoid overwriting an earlier file with the same name. Keep the real
                                    // extension so a .doc is not saved as .pdf; fileticket names have
                                    // none and stay .pdf.
                                    string extension = isFileTicket ? string.Empty : Path.GetExtension(localFile);

                                    if (string.IsNullOrEmpty(extension))
                                    {
                                        extension = ".pdf";
                                    }

                                    localFileFull = string.Format("{0}\\{1}_{2}{3}", localDirectory, Path.GetFileNameWithoutExtension(localFile), date, extension);
                                }

                                localDoc              = new Documents();
                                localDoc.DocId        = Guid.NewGuid().ToString();
                                localDoc.DocType      = categoryNode.InnerText.Trim('\t', '\n', '\r');
                                localDoc.DocLocalPath = localFileFull;
                                localDoc.CityId       = this.cityEntity.CityId;
                                localDoc.DocSource    = pdfUrl;

                                try
                                {
                                    c.DownloadFile(pdfUrl, localFileFull);
                                }
                                catch (Exception ex)
                                {
                                    // A failed download is skipped (not recorded), but no longer silently.
                                    Console.WriteLine("Download of {0} failed: {1}", pdfUrl, ex.Message);
                                    continue;
                                }

                                docs.Add(localDoc);
                            }

                            this.ReadText(true, localDoc.DocLocalPath, ref localDoc);
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("Now search in doc {0}.", localDoc.DocLocalPath);
                            Console.ResetColor();
                            QueryResult qr = queries.FirstOrDefault(q => q.DocId == localDoc.DocId);

                            if (qr == null)
                            {
                                qr = this.ExtractMeetingInformation(doc, localDoc);
                                queries.Add(qr);
                            }
                            else
                            {
                                this.ExtractQueriesFromDoc(localDoc, ref qr);
                            }

                            Console.WriteLine("{0} queries added, {1} docs added...", queries.Count, docs.Count);
                        }
                    }
                    else
                    {
                        // Remember pages without files so later runs can skip them.
                        listNoFiles.Add(i);
                        File.WriteAllLines(noFilesPath, listNoFiles.Select(t => t.ToString()));
                        Console.WriteLine("No files on {0}...", councilUrl);
                    }
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
Пример #29
0
        /// <summary>
        /// Loads the first page in <c>docUrls</c>, scans the table that contains the "Township Board"
        /// header and downloads every dated document linked from its first six columns.
        /// </summary>
        /// <remarks>
        /// Columns 0-1 are filed as "City Council", 2-3 as "Planning Commission" and 4-5 as
        /// "Zoning Board of Appeals". Links without an <c>href</c>, without an <c>M/d/yyyy</c> date in
        /// their text, or dated before <c>dtStartFrom</c> are skipped. Results are saved after every
        /// processed document.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            Regex              dateReg = new Regex("[0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4}");

            // Two adjacent table columns share one category.
            string[] categories = { "City Council", "Planning Commission", "Zoning Board of Appeals" };

            string listUrl = this.docUrls.FirstOrDefault();

            if (string.IsNullOrEmpty(listUrl))
            {
                Console.WriteLine("No url configured...");
                return;
            }

            HtmlDocument       doc       = web.Load(listUrl);
            HtmlNodeCollection fileNodes = doc.DocumentNode.SelectNodes("//*[text()='Township Board']/ancestor::table/tbody/tr[position()>2]");

            if (fileNodes == null)
            {
                return;
            }

            // WebClient is IDisposable; release it once all rows are processed.
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode fileNode in fileNodes)
                {
                    HtmlNodeCollection tds = fileNode.SelectNodes("./td");

                    if (tds == null)
                    {
                        // Row without cells (SelectNodes returns null when nothing matches).
                        continue;
                    }

                    // Rows may be shorter than six cells; never index past the end.
                    int columnCount = Math.Min(categories.Length * 2, tds.Count);

                    for (int i = 0; i < columnCount; i++)
                    {
                        string             category = categories[i / 2];
                        HtmlNodeCollection docNodes = tds[i].SelectNodes("./a");

                        if (docNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode docNode in docNodes)
                        {
                            var hrefAttribute = docNode.Attributes["href"];

                            if (hrefAttribute == null)
                            {
                                Console.WriteLine("No file...");
                                continue;
                            }

                            string meetingDocUrl   = this.cityEntity.CityUrl + "/" + hrefAttribute.Value;
                            string meetingDateText = dateReg.Match(docNode.InnerText).ToString();

                            if (string.IsNullOrEmpty(meetingDateText))
                            {
                                Console.WriteLine("No file...");
                                continue;
                            }

                            DateTime meetingDate;

                            // The pattern only checks digit counts, so e.g. "13/45/2020" still
                            // reaches this point; skip it instead of throwing.
                            if (!DateTime.TryParse(meetingDateText, out meetingDate))
                            {
                                Console.WriteLine("Invalid meeting date '{0}'. Skip...", meetingDateText);
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early...");
                                continue;
                            }

                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == meetingDocUrl);

                            if (localdoc == null)
                            {
                                localdoc              = new Documents();
                                localdoc.DocId        = Guid.NewGuid().ToString();
                                localdoc.CityId       = this.cityEntity.CityId;
                                localdoc.Important    = false;
                                localdoc.Checked      = false;
                                localdoc.DocSource    = meetingDocUrl;
                                localdoc.DocType      = category;
                                localdoc.DocLocalPath = string.Format("{0}\\{1}",
                                                                      this.localDirectory,
                                                                      meetingDocUrl.Split('/').LastOrDefault());

                                try
                                {
                                    c.DownloadFile(meetingDocUrl, localdoc.DocLocalPath);
                                }
                                catch (Exception ex)
                                {
                                    // Best effort, as before: the record is still kept, but the
                                    // failure is now reported instead of being hidden.
                                    Console.WriteLine("Download of {0} failed: {1}", meetingDocUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.WriteLine("This file already downloaded...");
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.CityId      = this.cityEntity.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);

                            // NOTE(review): saving per document is kept from the original; presumably
                            // this persists partial progress - confirm before batching it.
                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Пример #30
0
        /// <summary>
        /// Loads a listing page, downloads every linked document whose <c>href</c> contains "pdf" and
        /// records a document entry and a query result for each one.
        /// </summary>
        /// <param name="url">Address of the listing page to scan.</param>
        /// <param name="category">Value stored as <c>DocType</c> on newly created documents.</param>
        /// <param name="docs">Known documents; newly downloaded ones are appended.</param>
        /// <param name="queries">Known query results; new ones are appended.</param>
        /// <remarks>
        /// The meeting date is taken from the link text ("Month d, yyyy"). Links without a readable
        /// date or dated before <c>dtStartFrom</c> are skipped.
        /// </remarks>
        public void ExtractOthers(string url, string category, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            HtmlWeb web = new HtmlWeb();

            // "[a-zA-Z]" replaces the original "[a-zA-z]", whose A-z range also matched
            // the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
            Regex              dateReg     = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]+,[\\s]{0,2}[0-9]+");
            HtmlDocument       listDoc     = web.Load(url);
            HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//a[contains(@href,'pdf')]");

            if (docNodeList == null)
            {
                return;
            }

            // WebClient is IDisposable; release it once all links are processed.
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode docNode in docNodeList)
                {
                    string docUrl = docNode.Attributes["href"].Value;
                    docUrl = docUrl.StartsWith("http") ? docUrl : this.cityEntity.CityUrl + docUrl;

                    string linkText        = docNode.InnerText.Trim('\r', '\n', '\t', (char)32, (char)160);
                    string meetingDateText = dateReg.Match(linkText).ToString();
                    DateTime meetingDate;

                    // A link without a recognisable date must not abort the whole listing.
                    if (!DateTime.TryParse(meetingDateText, out meetingDate))
                    {
                        Console.WriteLine("No meeting date in '{0}', skip!", linkText);
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip!");
                        continue;
                    }

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                    if (localdoc == null)
                    {
                        localdoc              = new Documents();
                        localdoc.DocId        = Guid.NewGuid().ToString();
                        localdoc.CityId       = this.cityEntity.CityId;
                        localdoc.Checked      = false;
                        localdoc.Important    = false;
                        localdoc.DocSource    = docUrl;
                        localdoc.DocType      = category;
                        localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, docUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());

                        try
                        {
                            c.DownloadFile(docUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort, as before: the record is still kept, but the
                            // failure is now reported instead of being hidden.
                            Console.WriteLine("Download of {0} failed: {1}", docUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr             = new QueryResult();
                        qr.CityId      = localdoc.CityId;
                        qr.DocId       = localdoc.DocId;
                        qr.SearchTime  = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                }
            }
        }