/// <summary>
/// Builds the office search results URI for the given search by appending every search
/// field to <c>OfficeSearchResultsUrl</c> as a query-string parameter.
/// </summary>
/// <param name="formSearch">Search criteria; a null field value is sent as an empty parameter.</param>
/// <returns>The URI of the first results page for the search.</returns>
private static Uri CreateUriWithQueryString(FormSearch formSearch)
{
    // Example of the resulting URL:
    // http://media.ethics.ga.gov/search/Campaign/Campaign_OfficeSearchResults.aspx?
    // ElectionYear=2018&County=&City=&OfficeTypeID=120&District=&Division=&FilerID=&OfficeName=State%20Senate&Circuit=
    var sb = new StringBuilder(OfficeSearchResultsUrl);
    var separator = '?';

    // Every value is escaped individually: UrlPathEncode (below) leaves the query string
    // untouched, so an unescaped '&', '#', '+' or '=' in a value would corrupt the query.
    void AppendParameter(string name, object value)
    {
        sb.Append(separator)
          .Append(name)
          .Append('=')
          .Append(Uri.EscapeDataString(Convert.ToString(value) ?? string.Empty));
        separator = '&';
    }

    AppendParameter("ElectionYear", formSearch.ElectionYear);
    AppendParameter("County", formSearch.County);
    AppendParameter("City", formSearch.City);
    AppendParameter("OfficeTypeID", formSearch.OfficeTypeId);
    AppendParameter("District", formSearch.District);
    AppendParameter("Division", formSearch.Division);
    AppendParameter("FilerID", formSearch.FilerId);
    AppendParameter("OfficeName", formSearch.OfficeName); // spaces still become %20
    AppendParameter("Circuit", formSearch.Circuit);

    // Kept from the original: encodes only the path portion (everything before '?').
    var url = System.Web.HttpUtility.UrlPathEncode(sb.ToString());
    return new Uri(url);
}
/// <summary>
/// Requests results page <c>CurrentStatus.LastPageCompleted + 1</c> through <c>PostIt</c>,
/// splits the response on '|', and passes the table rows found in the fourth field to
/// <c>ProcessTable</c>. <c>CurrentStatus.LastPageCompleted</c> is incremented on every path,
/// including failures.
/// </summary>
/// <param name="formSearch">The search being paged; its office name and type id label the status messages.</param>
/// <returns>
/// true when more pages remain; false when this was the last page or the page could not be
/// read (in both cases <c>CurrentStatus.ScrapeComplete</c> is set).
/// </returns>
public static bool ReadSubsequentPage(FormSearch formSearch)
{
    const string tgtTable = "/div/div/table/tr";
    const int tableFieldIndex = 3; // field of the pipe-delimited response holding the table HTML

    var pageNumber = CurrentStatus.LastPageCompleted + 1;
    var officeNameAndId = $"Office {formSearch.OfficeName}-{formSearch.OfficeTypeId}, Pg {pageNumber}";

    // Shared failure path: record why, mark the scrape complete and count the page as attempted.
    bool Fail(string message)
    {
        CurrentStatus.LastOpMessage = message;
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.LastPageCompleted++;
        return false;
    }

    var contentString = PostIt(CurrentStatus.TheUri, pageNumber).Result;
    if (!_httpRespMsg.IsSuccessStatusCode)
    {
        return Fail($"ReadSubsequentPage {officeNameAndId} call returned Status Code: {_httpRespMsg.StatusCode}");
    }

    if (string.IsNullOrEmpty(contentString))
    {
        return Fail($"ReadSubsequentPage {officeNameAndId} received null content");
    }

    var pipeData = contentString.Split('|');
    if (pipeData.Length <= tableFieldIndex)
    {
        // A response that is not in the expected pipe-delimited shape (e.g. an error page)
        // used to throw IndexOutOfRangeException on pipeData[3].
        return Fail($"ReadSubsequentPage {officeNameAndId} response had {pipeData.Length} pipe-delimited fields, expected at least {tableFieldIndex + 1}.");
    }

    StorePostData(pipeData);

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(pipeData[tableFieldIndex]);
    var nodes = htmlDoc.DocumentNode.SelectNodes(tgtTable);
    if (nodes == null)
    {
        return Fail($"ReadSubsequentPage {officeNameAndId} Data table search returned null nodes.");
    }

    // ProcessTable already returns the number of candidate rows (header and page selector
    // rows excluded), so it is added as-is; the former "rows - 2" subtracted them twice.
    var rows = ProcessTable(formSearch, nodes, pageNumber);
    CurrentStatus.TotalCandidates += rows;
    CurrentStatus.LastPageCompleted++;

    if (CurrentStatus.TotalPages == pageNumber)
    {
        CurrentStatus.ScrapeComplete = true;
        return false; // Last page, don't continue
    }

    // More pages to do.
    return true;
}
/// <summary>
/// Runs one search: reads the first results page and then every subsequent page until the
/// reader reports that there is nothing more to do.
/// </summary>
/// <param name="search">The search to run; also stored in <c>SeqStatus.TheFormSearch</c>.</param>
/// <returns>
/// false only when the first page failed with TotalPages == -2 (connection problem), in which
/// case <c>SeqStatus.SequenceFail</c> is set; true in every other case.
/// </returns>
private bool RunQuery(FormSearch search)
{
    SeqStatus.TheFormSearch = search;

    string FirstPageFailure() =>
        $"RunQuery: Fail in first page search for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}: {UpdateCandidates.CurrentStatus.LastOpMessage}";

    if (!UpdateCandidates.ReadFirstPage(search))
    {
        // Don't continue, say why
        switch (UpdateCandidates.CurrentStatus.TotalPages)
        {
            case -2: // Problem with internet connection
                SeqStatus.LastOpMessage = FirstPageFailure();
                SeqStatus.SequenceFail = true;
                return false;
            case -1: // Problem with search: Could not retrieve URL, null content
                SeqStatus.LastOpMessage = FirstPageFailure();
                break;
            case 0: // No candidates found in category
                SeqStatus.LastOpMessage = $"RunQuery: No candidates found for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}.";
                break;
            case 1: // Only one page of results
                SeqStatus.LastOpMessage = $"RunQuery: Found {UpdateCandidates.Candidates.Count} for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}.";
                break;
            default:
                SeqStatus.LastOpMessage = $"RunQuery: ReadFirstPage said don't continue for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}, PageCount: {UpdateCandidates.CurrentStatus.TotalPages}: Should never get here!";
                break;
        }

        // ReadFirstPage said not to continue, so do not fall into the paging loop (previously
        // the loop still ran when the first page failed after the page count had been read).
        return true;
    }

    while (UpdateCandidates.CurrentStatus.LastPageCompleted < UpdateCandidates.CurrentStatus.TotalPages)
    {
        // ReadSubsequentPage returns false on the last page and on any failure; honour it
        // instead of continuing to request the remaining pages.
        if (!UpdateCandidates.ReadSubsequentPage(search))
        {
            break;
        }
    }

    return true;
}
/// <summary>
/// Fetches the first results page for a search, stores the session id and hidden form fields,
/// reads the page count from the footer, verifies the table column headers and passes the
/// table rows to <c>ProcessTable</c>.
/// </summary>
/// <param name="formSearch">The search to run; its fields build the query string.</param>
/// <returns>
/// true when page 1 was processed and more pages remain; false otherwise. On a false return
/// <c>CurrentStatus.ScrapeComplete</c> is set and <c>CurrentStatus.TotalPages</c> is -2 when
/// the request threw, -1 when the page could not be retrieved or was empty, 0 when the search
/// had no results, or the page count read from the footer.
/// </returns>
public static bool ReadFirstPage(FormSearch formSearch)
{
    const string tgtNoResults = "//*[@id=\"ctl00_ContentPlaceHolder1_lblMessage\"]";
    const string tgtTable = "/html/body/form/table/tr[2]/td/table/tr/td[2]/div/div/div/table/tr";
    const string tgtFooter = "//*[@id=\"ctl00_ContentPlaceHolder1_pSection\"]";
    const string tgtHiddenFields = "/html/body/form/input";
    const string thNames = "ActionCandidate NameElection YearDOI Filed"; // expected header cells, concatenated

    var officeNameAndId = $"Office {formSearch.OfficeName}-{formSearch.OfficeTypeId}";

    ResetStatus(true, true); // sets TotalPages = -1
    CurrentStatus.TheUri = CreateUriWithQueryString(formSearch);
    CurrentStatus.Url = CurrentStatus.TheUri.OriginalString;

    try
    {
        _httpRespMsg = GetSearchPage(CurrentStatus.TheUri, HttpMethod.Get).Result;
        Log.Debug(ListHeaderValues());
        _aspnetSessionId = GetSessionId(_httpRespMsg);
        Log.Debug($"ASPNET_SessionId = {_aspnetSessionId}");
    }
    catch (Exception ex)
    {
        CurrentStatus.LastOpMessage = $"ReadFirstPage {officeNameAndId} exception calling GetSearchPage: {Utils.ExceptionInfo(ex)}";
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.TotalPages = -2;
        return false;
    }

    if (!_httpRespMsg.IsSuccessStatusCode)
    {
        CurrentStatus.LastOpMessage = $"ReadFirstPage {officeNameAndId} could not retrieve URL, StatusCode: {_httpRespMsg.StatusCode}";
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.TotalPages = -1;
        return false;
    }

    var contentString = _httpRespMsg.Content.ReadAsStringAsync().Result;
    if (string.IsNullOrEmpty(contentString))
    {
        // Fixed: the message was missing its '$' prefix and printed "{officeNameAndId}" literally.
        CurrentStatus.LastOpMessage = $"ReadFirstPage {officeNameAndId} received null content";
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.TotalPages = -1;
        return false;
    }

    // Counted only after the null check so a null body cannot throw here.
    BytesReceived += contentString.Length;

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(contentString);

    // Check for "Search Returned No Results"
    var noResultsNodes = htmlDoc.DocumentNode.SelectNodes(tgtNoResults);
    if (noResultsNodes != null && noResultsNodes[0].InnerHtml.Contains("Search Returned No Results."))
    {
        CurrentStatus.LastOpMessage = $"Search Returned No Candidates for {officeNameAndId}";
        CurrentStatus.TotalPages = 0;
        CurrentStatus.TotalCandidates = 0;
        CurrentStatus.ScrapeComplete = true;
        return false;
    }

    // Get the hidden input fields of the form.
    // If you use HtmlWeb.Load(URL), run StoreHidden twice from two divs at this level.
    var headNodes = htmlDoc.DocumentNode.SelectNodes(tgtHiddenFields);
    StoreHidden(headNodes);

    // Get page info from the footer
    var footerNodes = htmlDoc.DocumentNode.SelectNodes(tgtFooter);
    if (footerNodes == null)
    {
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.LastOpMessage = $"ReadFirstPage: FooterNodes search returned null for {officeNameAndId}";
        return false;
    }

    var pageAndCount = ScrapHelp.GetCurrentPageAndCount(footerNodes[0].InnerHtml);
    CurrentStatus.TotalPages = pageAndCount.Item2;
    var hasPageSelectorRow = pageAndCount.Item2 > 1; // If there is only a single page there is no Page selector row
    CurrentStatus.LastOpMessage = $"ReadFirstPage: Page count for {officeNameAndId} = {pageAndCount.Item2}.";

    // Get candidate data from the table
    var nodes = htmlDoc.DocumentNode.SelectNodes(tgtTable);
    if (nodes == null)
    {
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.LastOpMessage = $"ReadFirstPage: Data table search returned null for {officeNameAndId}.";
        return false;
    }

    // Check if the column headers have changed
    var htmlDocTh = new HtmlDocument();
    htmlDocTh.LoadHtml(nodes[0].InnerHtml);
    var htmlBodyTh = htmlDocTh.DocumentNode.SelectNodes("//th");

    // SelectNodes yields null when the first row has no <th>; treat that as an empty header so
    // it is reported as a mismatch instead of throwing. (Was InnerHtml for HtmlWeb.Load.)
    var checkStr = htmlBodyTh == null
        ? string.Empty
        : string.Concat(htmlBodyTh.Select(th => th.InnerText));
    if (thNames != checkStr)
    {
        CurrentStatus.ScrapeComplete = true;
        CurrentStatus.LastOpMessage = $"ReadFirstPage: {officeNameAndId}, Table header mismatch, should be: {thNames} but is: {checkStr}";
        return false;
    }

    // Go through the table nodes
    CurrentStatus.LastOpMessage = $"ReadFirstPage: Calling ProcessTable for {officeNameAndId}, nodes.count = {nodes.Count}, hasPageSelectorRow = {hasPageSelectorRow}.";
    var rows = ProcessTable(formSearch, nodes, 1, hasPageSelectorRow);
    // NOTE(review): the first page's rows are reported here but not added to
    // CurrentStatus.TotalCandidates — confirm whether that is intended.
    CurrentStatus.LastOpMessage = $"ReadFirstPage read page 1 for {officeNameAndId} with candidate count {rows}";
    CurrentStatus.LastPageCompleted = 1;

    if (CurrentStatus.TotalPages == 1)
    {
        CurrentStatus.ScrapeComplete = true;
        return false; // Last page, don't continue
    }

    // More pages to do.
    return true;
}
/// <summary>
/// Builds a <c>Candidate</c> from each data row of a results table, reads its additional
/// info through <c>AdditionalInfo.ReadThePage</c> and appends it to <c>Candidates</c>.
/// </summary>
/// <param name="formSearch">Search that produced the table; supplies OfficeTypeId and OfficeName for each candidate.</param>
/// <param name="nodes">The table rows: a header row, the candidate rows and, when present, a trailing page selector row.</param>
/// <param name="pageNumber">The current page number of the query results (used in diagnostics only).</param>
/// <param name="hasPageSelectorRow">true when the last row is the page selector and must not be parsed as a candidate.</param>
/// <returns>Number of candidate rows processed (rows skipped as malformed are not counted).</returns>
private static int ProcessTable(FormSearch formSearch, HtmlNodeCollection nodes, int pageNumber, bool hasPageSelectorRow = true)
{
    var processed = 0;
    var rowIndex = 0;
    var lastCandidateRowIdx = nodes.Count - (hasPageSelectorRow ? 2 : 1);

    foreach (var node in nodes)
    {
        if (rowIndex == 0)
        {
            // Header row: already checked by the caller.
            rowIndex++;
            continue;
        }

        if (rowIndex > lastCandidateRowIdx)
        {
            // Only reached when there is a page selector row; it is not needed.
            break;
        }

        var currentRow = rowIndex++;
        var candidateNode = node.InnerHtml;

        // The ids are cut out of the candidate link, e.g.
        // .....&NameID=26758&FilerID=C2017000427&Type=candidate.....
        var nameIdMarkerIdx = candidateNode.IndexOf("NameID=", StringComparison.Ordinal);
        var truncatedInner = nameIdMarkerIdx < 0
            ? string.Empty
            : candidateNode.Substring(nameIdMarkerIdx + 7); // 26758&FilerID=C2017000427&Type=candidate.....

        var nameIdLength = truncatedInner.IndexOf("&", StringComparison.Ordinal);
        var filerIdMarkerIdx = truncatedInner.IndexOf("FilerID=", StringComparison.Ordinal);
        var typeMarkerIdx = truncatedInner.IndexOf("Type=", StringComparison.Ordinal);
        var firstTdIdx = truncatedInner.IndexOf("<td", StringComparison.Ordinal);

        var filerIdStIdx = filerIdMarkerIdx + 8;
        // The 5 characters dropped before "Type=" are presumably the HTML-encoded "&amp;" — TODO confirm.
        var filerIdLength = typeMarkerIdx - 5 - filerIdStIdx;

        if (nameIdLength < 0 || filerIdMarkerIdx < 0 || typeMarkerIdx < 0 || firstTdIdx < 0 || filerIdLength < 0)
        {
            // Previously a row without the expected markers threw ArgumentOutOfRangeException
            // (or produced garbage ids); skip it and say so instead.
            CurrentStatus.LastOpMessage = $"ProcessTable: skipped row {currentRow} on page {pageNumber}: candidate link markers not found.";
            continue;
        }

        var candidate = new Candidate
        {
            NameId = truncatedInner.Substring(0, nameIdLength),
            FilerId = truncatedInner.Substring(filerIdStIdx, filerIdLength),
            OfficeTypeId = formSearch.OfficeTypeId,
            OfficeName = formSearch.OfficeName
        };

        // Re-parse the remaining cells of the row as their own fragment. The unterminated
        // "</tr" is kept exactly as it was: the descendant positions used below (3, 7, 13, 16)
        // depend on how this fragment parses.
        var last3TdStr = truncatedInner.Substring(firstTdIdx);
        var htmlDocTr = new HtmlDocument();
        htmlDocTr.LoadHtml("<tr>" + last3TdStr.Trim() + "</tr");
        var trBody = htmlDocTr.DocumentNode.SelectNodes("//tr");

        var tdCounter = 0;
        foreach (var td in trBody.Descendants())
        {
            var tdText = td.InnerText.Trim();
            // Was idx 0,5,10 for HtmlWeb.Load
            switch (tdCounter++)
            {
                case 3:
                    candidate.CandidateName = tdText;
                    break;

                case 7:
                    if (!int.TryParse(tdText, out var year))
                    {
                        candidate.Notes += "Empty Election Year.";
                        if (tdText != string.Empty)
                        {
                            CurrentStatus.LastOpMessage = $"Non-int year text: {tdText} for candidate {candidate.CandidateName}.";
                        }
                    }
                    candidate.Year = year; // 0 when the text is not an integer
                    break;

                case 13:
                case 16:
                    if (!DateTime.TryParse(tdText, out var dateDoiFiled))
                    {
                        candidate.Notes += "Empty DOI date.";
                        if (tdText != string.Empty)
                        {
                            CurrentStatus.LastOpMessage = $"Non-Date DOI text: {tdText} for candidate {candidate.CandidateName}.";
                        }
                    }
                    candidate.DoiFiled = dateDoiFiled; // default(DateTime) when the text is not a date
                    break;

                default:
                    break;
            }
        }

        // Get the additional data: office district, affiliation, status
        AdditionalInfo.ReadThePage(candidate);
        Candidates.Add(candidate);
        processed++;
    }

    // Fixed: the single-page path used to return rowIndex - 2, one fewer than the rows processed.
    return processed;
}