/// <summary>
/// Fetches a monthly category page and parses its expense table into
/// <see cref="MonthlyCategoryExpense"/> records, optionally following the "Next"
/// pagination link to collect the rows of the following pages as well.
/// </summary>
/// <param name="category">
/// Category to list. Its <c>CategoryUrl</c> is fetched when <paramref name="paginatedLink"/>
/// is null, and its Agency, Year and Month are copied onto every returned expense.
/// </param>
/// <param name="paginatedLink">Page URL to fetch instead of the category URL (used when following pagination).</param>
/// <param name="autoPaginate">When true and the page has a "Next" link, the next page is fetched recursively and its rows appended.</param>
/// <returns>The parsed expenses; an empty list when the table has no data rows.</returns>
public async Task<List<MonthlyCategoryExpense>> GetMonthCategoryExpenses(MonthlyCategory category, Uri paginatedLink = null, bool autoPaginate = true)
{
    // @todo compare category agency, year, month against last search value and auto-search again if needed

    // Relative hrefs found in the page are resolved against this base address.
    var baseUri = new Uri("https://applications.sc.gov/SpendingTransparency/");

    using (var http = new HttpClient(_handler, false))
    {
        var url = paginatedLink ?? category.CategoryUrl;
        var response = await http.GetStringAsync(url);

        var doc = new HtmlDocument();
        doc.LoadHtml(response);

        _lastWebFormsValues = ParseWebForms(doc);

        var nextNode = doc.DocumentNode.SelectSingleNode("//div[ contains( @class, 'nextBack' ) ]/a[ contains( text(), 'Next' ) ]");

        // position() > 1 skips the first row of the table (the header row).
        var expenseRows = doc.DocumentNode.SelectNodes(
            "//table[ @id='ctl00_ContentPlaceHolder_ObjectDataControl_CategoryDataTable' ]//tr[ position() > 1 ]");

        var expenses = new List<MonthlyCategoryExpense>();

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so a page without data rows must not be enumerated.
        if (expenseRows != null)
        {
            foreach (var expenseRow in expenseRows)
            {
                var linkNode = expenseRow.SelectSingleNode(".//a");
                var expenseName = linkNode.InnerText;
                var expenseLink = linkNode.GetAttributeValue("href", "");
                var amount = expenseRow.SelectSingleNode("./td[ contains( @class, 'total_highlight' ) ]").InnerText;

                // Strip the currency symbol and thousands separators, then parse with the
                // invariant culture so the result does not depend on the machine's locale.
                var amountValue = Convert.ToDecimal(
                    amount.Replace("$", "").Replace(",", ""),
                    System.Globalization.CultureInfo.InvariantCulture);

                expenses.Add(new MonthlyCategoryExpense()
                {
                    Expense = expenseName,
                    ExpenseUrl = new Uri(baseUri, expenseLink),
                    Amount = amountValue,
                    Agency = category.Agency,
                    Year = category.Year,
                    Month = category.Month,
                    Category = category
                });
            }
        }

        // if there's another page of expenses, add those recursively - only if we haven't disabled paginating
        if (nextNode != null && autoPaginate)
        {
            var nextLink = new Uri(baseUri, nextNode.GetAttributeValue("href", ""));
            expenses.AddRange(await GetMonthCategoryExpenses(category, nextLink));
        }

        return expenses;
    }
}
/// <summary>
/// Downloads the CSV export of the payments behind a single expense and parses it into
/// <see cref="MonthlyCategoryExpensePayment"/> records.
/// </summary>
/// <remarks>
/// The expense page is first fetched with GET so its WebForms values can be captured;
/// those values are then posted back together with the export button to obtain the CSV.
/// </remarks>
/// <param name="expense">Expense whose <c>ExpenseUrl</c> is used for both the GET and the POST request.</param>
/// <returns>The payments parsed from the CSV export.</returns>
/// <exception cref="HttpRequestException">Either request completed with a non-success status code.</exception>
/// <exception cref="Exception">The CSV could not be parsed; the underlying error is the inner exception.</exception>
public async Task<List<MonthlyCategoryExpensePayment>> GetMonthCategoryExpensePayments(MonthlyCategoryExpense expense)
{
    using (var http = new HttpClient(_handler, false))
    {
        // we have to first fetch the initial page to get webforms values
        string getResponseContent;
        using (var getResponse = await http.GetAsync(expense.ExpenseUrl))
        {
            getResponse.EnsureSuccessStatusCode();
            getResponseContent = await getResponse.Content.ReadAsStringAsync();
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(getResponseContent);

        _lastWebFormsValues = ParseWebForms(doc);

        // then we can add those into the POST request and get everything as a CSV
        var postValues = AddWebForms(new List<KeyValuePair<string, string>>()
        {
            new KeyValuePair<string, string>("ctl00$ContentPlaceHolder$PayeeControl$SortDropDownList", "0"),
            new KeyValuePair<string, string>("ctl00$ContentPlaceHolder$PayeeControl$ExportButton", "Download .CSV file"),
        });

        string responseBody;
        using (var postBody = new FormUrlEncodedContent(postValues))
        using (var response = await http.PostAsync(expense.ExpenseUrl, postBody))
        {
            response.EnsureSuccessStatusCode();
            responseBody = await response.Content.ReadAsStringAsync();
        }

        // Drop the first 6 non-empty lines: they are a preamble that precedes the CSV header row.
        var lines = responseBody.Split(new[] { '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
        var csvContents = String.Join("\r\n", lines.Skip(6));

        // there is also a case where a record ends up looking like this:
        // MOTOROLA INC,3406569007,6/1/2015,Earmarked,"""HIGHWAY PATROL FEES,FINES, & ASSESSME,$88151.6500
        // this is the only case where double quotes have been seen, so they are all blindly
        // removed, since textual fields aren't quoted anyway
        csvContents = csvContents.Replace("\"\"\"", "");

        using (var stream = new StringReader(csvContents))
        {
            try
            {
                using (var csv = new CsvReader(stream))
                {
                    csv.Configuration.TrimOptions = TrimOptions.Trim | TrimOptions.InsideQuotes;
                    csv.Configuration.RegisterClassMap<ExpensePaymentClassMap>();

                    // the subfund column header is sometimes "SubFund Title" and other times
                    // "SubFund_Title" - normalise underscores to spaces before matching
                    csv.Configuration.PrepareHeaderForMatch = header => header.Replace("_", " ");

                    // ToList() materialises every record before the reader is disposed.
                    return csv.GetRecords<MonthlyCategoryExpensePayment>().ToList();
                }
            }
            catch (Exception e)
            {
                throw new Exception("Unable to parse expenses from CSV!", e);
            }
        }
    }
}
/// <summary>
/// Loads the monthly expenditure search page and stores the options of its year, month and
/// agency dropdowns in <c>_years</c>, <c>_months</c> and <c>_agencies</c>. The page's WebForms
/// values are stored in <c>_lastWebFormsValues</c>.
/// </summary>
/// <remarks>
/// Options whose <c>value</c> attribute is "-1", or that have no <c>value</c> attribute at all
/// (the lookup defaults to "-1"), are excluded.
/// </remarks>
private async Task GetSearchValues()
{
    using (var http = new HttpClient(_handler, false))
    {
        var response = await http.GetStringAsync("https://applications.sc.gov/SpendingTransparency/MonthlyExpenditureSearch.aspx?etype=1");

        var doc = new HtmlDocument();
        doc.LoadHtml(response);

        _lastWebFormsValues = ParseWebForms(doc);

        var yearsOptions = doc.DocumentNode.SelectNodes(
            "//select[ @id='ctl00_ContentPlaceHolder_SearchControl_YearDropdownList' ]/option");
        var monthsOptions = doc.DocumentNode.SelectNodes(
            "//select[ @id='ctl00_ContentPlaceHolder_SearchControl_MonthDropdownList' ]/option");
        var agencyOptions = doc.DocumentNode.SelectNodes(
            "//select[ @id='ctl00_ContentPlaceHolder_SearchControl_AgencyDropdownList' ]/option");

        var years = yearsOptions
            .Where(x => x.GetAttributeValue("value", "-1") != "-1")
            .Select(x => new Year()
            {
                SearchValue = x.GetAttributeValue("value", "-1"),
                Text = x.InnerText,
            })
            .ToList();

        var months = monthsOptions
            .Where(x => x.GetAttributeValue("value", "-1") != "-1")
            .Select(x => new Month()
            {
                SearchValue = x.GetAttributeValue("value", "-1"),
                Text = x.InnerText,
            })
            .ToList();

        var agencies = agencyOptions
            .Where(x => x.GetAttributeValue("value", "-1") != "-1")
            .Select(x => new Agency()
            {
                SearchValue = x.GetAttributeValue("value", "-1"),
                // Agency names can contain an HTML-encoded ampersand; store the plain character.
                Text = x.InnerText.Replace("&amp;", "&"),
            })
            .ToList();

        // Assign only after all three lists were built, so a failure above leaves the
        // previously stored values untouched.
        _years = years;
        _months = months;
        _agencies = agencies;
    }
}