Example #1
0
        /// <summary>
        /// Matches the "Title N - Heading" caption: group 1 is the one- or two-digit title
        /// number, group 2 the run of word characters and whitespace that follows the dash.
        /// Built once and reused so the pattern is not re-parsed on every call.
        /// </summary>
        private static readonly Regex TitleHeadingPattern = new Regex(@"Title (\d{1,2}) - ((\w*\s|\w*)*)");

        /// <summary>
        /// Get the Title content from the detail page (/arsDetail/?title=1)
        /// </summary>
        /// <param name="content">Text of a title detail page that is searched for the "Title N - Heading" caption.</param>
        /// <returns>
        /// A new <see cref="LegTitle"/>. When the caption is found, its Title, Heading and UrlAzLeg
        /// are filled in; when it is not found the instance is returned exactly as constructed.
        /// </returns>
        public LegTitle ParseTitleContent(string content)
        {
            var title = new LegTitle();
            var match = TitleHeadingPattern.Match(content);

            if (!match.Success)
            {
                return title;
            }

            title.Title = Int32.Parse(match.Groups[1].Value);

            // The heading group also swallows whitespace (including line breaks) that follows
            // the last word, so strip it before storing the heading.
            title.Heading = match.Groups[2].Value.Trim();

            // Same "/arsDetail/?title=N" form as the detail page address in the summary above.
            title.UrlAzLeg = $"https://www.azleg.gov/arsDetail/?title={title.Title}";

            return title;
        }
Example #2
0
        /// <summary>
        /// Console entry point. Reads every file under .\Scratch\TitlePages, parses the title
        /// and its chapters from each one, and reports progress on the console. Any exception
        /// is caught and its message is written to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                using (var context = new AzLegContext())
                {
                    //add two Titles that have been repealed
                    // NOTE(review): SaveChanges is commented out further down, so this method
                    // only adds these two titles to the context and never saves them.
                    var title2 = new LegTitle
                    {
                        Title    = 2,
                        UrlAzLeg = "https://www.azleg.gov/arsDetail/?title=2"
                    };
                    var title24 = new LegTitle
                    {
                        Title    = 24,
                        UrlAzLeg = "https://www.azleg.gov/arsDetail/?title=24"
                    };
                    context.LegTitles.AddRange(new LegTitle[] { title2, title24 });

                    //base address used in scraper's httpclient
                    var scraper = new Scraper("https://www.azleg.gov/");

                    //all the file names of the title page html files
                    var titlePageFiles = Directory.GetFiles(".\\Scratch\\TitlePages");

                    foreach (string filePath in titlePageFiles)
                    {
                        string fileContents = File.ReadAllText(filePath);

                        //parse for Title
                        LegTitle legTitle = scraper.ParseTitleContent(fileContents);

                        //Get title back from db so you have the DB PK for the FKs in Chapters
                        LegTitle titleFromDB = context.LegTitles.FirstOrDefault(x => x.Title == legTitle.Title);

                        // FirstOrDefault yields null when nothing matches. Report and skip this
                        // file instead of letting a NullReferenceException end the whole run.
                        if (titleFromDB == null)
                        {
                            Console.WriteLine($"\tERROR: no stored title matches title {legTitle.Title} parsed from file {filePath}; skipping");
                            continue;
                        }

                        //get title chapters
                        var legChapterList = scraper.ParseChapterContent(fileContents, titleFromDB.Id);

                        // Persistence is currently switched off: the chapters are parsed but
                        // neither added to the context nor saved.
                        //context.LegChapters.AddRange(legChapterList);
                        //context.SaveChanges();

                        Console.WriteLine($"finished processing title content in file {filePath}");
                    }
                }
            }
            catch (Exception exception)
            {
                Console.WriteLine("\tERROR: " + exception.Message);
            }
        }
Example #3
0
        /// <summary>
        /// Reads every file under .\Scratch\TitlePages, parses the title and its chapters from
        /// each one, and stores them through an <see cref="AzLegContext"/>.
        /// </summary>
        public void ScrapeAndDump()
        {
            using (var context = new AzLegContext())
            {
                //base address used in scraper's httpclient
                var scraper = new Scraper("https://www.azleg.gov/");

                //all the file names of the title page html files
                var titlePageFiles = Directory.GetFiles(".\\Scratch\\TitlePages");

                foreach (string filePath in titlePageFiles)
                {
                    string fileContents = File.ReadAllText(filePath);

                    LegTitle legTitle = scraper.ParseTitleContent(fileContents);

                    //add title to db
                    // Save the title right away so it has its database key before the chapters
                    // that reference it are parsed. Querying the set for an entity that was only
                    // Add()-ed does not return it, which left the lookup result null here.
                    // NOTE(review): assumes AzLegContext is an Entity Framework context whose
                    // SaveChanges fills in the generated Id on the tracked entity - confirm.
                    context.LegTitles.Add(legTitle);
                    context.SaveChanges();

                    //get title chapters
                    var legChapterList = scraper.ParseChapterContent(fileContents, legTitle.Id);

                    context.LegChapters.AddRange(legChapterList);

                    //add Chapter Articles

                    Console.WriteLine($"finished processing title content in file {filePath}");
                }

                //add two Titles that have been repealed
                //var title2 = new LegTitle();
                //title2.Title = 2;
                //title2.UrlAzLeg = "https://www.azleg.gov/arsDetail/?title=2";
                //var title24 = new LegTitle();
                //title24.Title = 24;
                //title24.UrlAzLeg = "https://www.azleg.gov/arsDetail/?title=24";
                //context.LegTitles.AddRange(new LegTitle[] { title2, title24 });

                // Saves the chapters added during the loop.
                context.SaveChanges();
            }
        }