コード例 #1
0
        /// <summary>
        /// Builds the office search results URI for the supplied search criteria.
        /// </summary>
        /// <param name="formSearch">Search criteria whose values populate the query string.</param>
        /// <returns>The search results URI, consisting of <c>OfficeSearchResultsUrl</c> plus the query string.</returns>
        private static Uri CreateUriWithQueryString(FormSearch formSearch)
        {
            // Example of the URL being produced:
            //http://media.ethics.ga.gov/search/Campaign/Campaign_OfficeSearchResults.aspx?
            //ElectionYear=2018&County=&City=&OfficeTypeID=120&District=&Division=&FilerID=&OfficeName=State%20Senate&Circuit=

            // Percent-encodes a single query value. Null becomes an empty value, and reserved
            // characters (&, =, #, +, space, ...) can no longer break the query string apart.
            string Escape(object value)
            {
                return(Uri.EscapeDataString(value?.ToString() ?? string.Empty));
            }

            var sb = new StringBuilder(OfficeSearchResultsUrl);

            sb.Append("?ElectionYear=").Append(Escape(formSearch.ElectionYear));
            sb.Append("&County=").Append(Escape(formSearch.County));
            sb.Append("&City=").Append(Escape(formSearch.City));
            sb.Append("&OfficeTypeID=").Append(Escape(formSearch.OfficeTypeId));
            sb.Append("&District=").Append(Escape(formSearch.District));
            sb.Append("&Division=").Append(Escape(formSearch.Division));
            sb.Append("&FilerID=").Append(Escape(formSearch.FilerId));
            sb.Append("&OfficeName=").Append(Escape(formSearch.OfficeName));
            sb.Append("&Circuit=").Append(Escape(formSearch.Circuit));

            // UrlPathEncode only touches the part before '?', so the already-escaped
            // query string is passed through unchanged.
            var url = System.Web.HttpUtility.UrlPathEncode(sb.ToString());

            return(new Uri(url));
        }
コード例 #2
0
        /// <summary>
        /// Requests and processes the results page that follows <c>CurrentStatus.LastPageCompleted</c>.
        /// </summary>
        /// <param name="formSearch">The search whose result pages are being read.</param>
        /// <returns>
        /// <c>true</c> when the page was processed and more pages remain; <c>false</c> when the last
        /// page was processed or the page could not be read (the reason is in <c>CurrentStatus.LastOpMessage</c>).
        /// </returns>
        /// <remarks>
        /// <c>CurrentStatus.LastPageCompleted</c> is incremented on every path, including failures.
        /// </remarks>
        public static bool ReadSubsequentPage(FormSearch formSearch)
        {
            // Index of the pipe-delimited response field that is parsed as the HTML fragment.
            const int    htmlFieldIndex = 3;
            const string tgtTable       = "/div/div/table/tr";

            var pageNumber      = CurrentStatus.LastPageCompleted + 1;
            var officeNameAndId = $"Office {formSearch.OfficeName}-{formSearch.OfficeTypeId}, Pg {pageNumber}";

            // Shared failure path: record why, mark the scrape finished and count the page as done.
            bool Fail(string message)
            {
                CurrentStatus.LastOpMessage  = message;
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.LastPageCompleted++;
                return(false);
            }

            var contentString = PostIt(CurrentStatus.TheUri, pageNumber).Result;

            if (!_httpRespMsg.IsSuccessStatusCode)
            {
                return(Fail($"ReadSubsequentPage {officeNameAndId} call returned Status Code: {_httpRespMsg.StatusCode}"));
            }

            if (string.IsNullOrEmpty(contentString))
            {
                return(Fail($"ReadSubsequentPage {officeNameAndId} received null content"));
            }

            //CurrentStatus.LastOpMessage = "ReadSubsequentPage received document length = " + contentString.Length;

            var pipeData = contentString.Split('|');

            StorePostData(pipeData);

            // A response without enough pipe-delimited fields previously caused an
            // IndexOutOfRangeException below; report it like the other read failures instead.
            if (pipeData.Length <= htmlFieldIndex)
            {
                return(Fail($"ReadSubsequentPage {officeNameAndId} response had {pipeData.Length} pipe-delimited fields, expected at least {htmlFieldIndex + 1}."));
            }

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(pipeData[htmlFieldIndex]);

            var nodes = htmlDoc.DocumentNode.SelectNodes(tgtTable);

            if (nodes == null)
            {
                return(Fail($"ReadSubsequentPage {officeNameAndId} Data table search returned null nodes."));
            }

            var rows = ProcessTable(formSearch, nodes, pageNumber);

            //CurrentStatus.LastOpMessage = $"ReadSubsequentPage read page {pageNumber} with candidate count " + (rows - 2);
            // NOTE(review): the "- 2" adjustment is kept from the original; confirm it matches
            // what ProcessTable returns for these pages.
            CurrentStatus.TotalCandidates += (rows - 2);
            CurrentStatus.LastPageCompleted++;

            if (CurrentStatus.TotalPages == pageNumber)
            {
                CurrentStatus.ScrapeComplete = true;
                return(false); // Last page, don't continue
            }

            // More pages to do.
            return(true);
        }
コード例 #3
0
        /// <summary>
        /// Runs one search: reads the first results page and then every remaining page.
        /// </summary>
        /// <param name="search">The search to execute; also stored in <c>SeqStatus.TheFormSearch</c>.</param>
        /// <returns>
        /// <c>false</c> only when the first page read reports <c>TotalPages == -2</c>
        /// (in which case <c>SeqStatus.SequenceFail</c> is set); otherwise <c>true</c>.
        /// </returns>
        private bool RunQuery(FormSearch search)
        {
            SeqStatus.TheFormSearch = search;

            if (!UpdateCandidates.ReadFirstPage(search))
            {
                // Don't continue, say why

                switch (UpdateCandidates.CurrentStatus.TotalPages)
                {
                case -2:
                    // Problem with internet connection
                    SeqStatus.LastOpMessage =
                        $"RunQuery: Fail in first page search for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}: {UpdateCandidates.CurrentStatus.LastOpMessage}";
                    SeqStatus.SequenceFail = true;
                    return(false);

                case -1:
                    // Problem with search: Could not retrieve URL, null content
                    SeqStatus.LastOpMessage =
                        $"RunQuery: Fail in first page search for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}: {UpdateCandidates.CurrentStatus.LastOpMessage}";
                    break;

                case 0:
                    // No candidates found in category
                    SeqStatus.LastOpMessage =
                        $"RunQuery: No candidates found for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}.";
                    break;

                case 1:
                    // Only one page of results
                    SeqStatus.LastOpMessage =
                        $"RunQuery: Found {UpdateCandidates.Candidates.Count} for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}.";
                    break;

                default:
                    SeqStatus.LastOpMessage =
                        $"RunQuery: ReadFirstPage said don't continue for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}, PageCount: {UpdateCandidates.CurrentStatus.TotalPages}: Should never get here!";
                    break;
                }

                // ReadFirstPage asked us to stop. Previously control fell through into the paging
                // loop, which still ran when the first page failed after TotalPages had been set
                // (table not found / header mismatch). These outcomes are not sequence failures.
                return(true);
            }

            // ReadSubsequentPage advances LastPageCompleted on every call, so this loop terminates;
            // its return value is not needed here.
            while (UpdateCandidates.CurrentStatus.LastPageCompleted < UpdateCandidates.CurrentStatus.TotalPages)
            {
                // SeqStatus.LastOpMessage = $"RunQuery: Reading subsequent page {pageCounter++} for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}.";
                UpdateCandidates.ReadSubsequentPage(search);
            }

            //SeqStatus.LastOpMessage =
            //    $"RunQuery: Finished query for {search.OfficeName}, officeTypeId: {search.OfficeTypeId}, Candidate Count: {UpdateCandidates.CurrentStatus.TotalCandidates}";

            return(true);
        }
コード例 #4
0
        /// <summary>
        /// Requests the first search results page, validates its layout, stores the hidden form
        /// fields and processes the candidate rows on that page.
        /// </summary>
        /// <param name="formSearch">The search criteria used to build the request URI.</param>
        /// <returns>
        /// <c>true</c> when page 1 was processed and more pages remain; <c>false</c> otherwise.
        /// On <c>false</c>, <c>CurrentStatus.TotalPages</c> and <c>CurrentStatus.LastOpMessage</c>
        /// describe the outcome: -2 = exception while requesting the page, -1 = page could not be
        /// retrieved or was empty, 0 = search returned no results, 1 = single page of results.
        /// </returns>
        public static bool ReadFirstPage(FormSearch formSearch)
        {
            const string tgtNoResults    = "//*[@id=\"ctl00_ContentPlaceHolder1_lblMessage\"]";
            const string tgtTable        = "/html/body/form/table/tr[2]/td/table/tr/td[2]/div/div/div/table/tr";
            const string tgtFooter       = "//*[@id=\"ctl00_ContentPlaceHolder1_pSection\"]";
            const string tgtHiddenFields = "/html/body/form/input";

            var officeNameAndId = $"Office {formSearch.OfficeName}-{formSearch.OfficeTypeId}";

            ResetStatus(true, true);        // sets TotalPages = -1
            CurrentStatus.TheUri = CreateUriWithQueryString(formSearch);
            CurrentStatus.Url    = CurrentStatus.TheUri.OriginalString;

            try
            {
                _httpRespMsg = GetSearchPage(CurrentStatus.TheUri, HttpMethod.Get).Result;
                Log.Debug(ListHeaderValues());
                _aspnetSessionId = GetSessionId(_httpRespMsg);
                Log.Debug($"ASPNET_SessionId = {_aspnetSessionId}");
            }
            catch (Exception ex)
            {
                CurrentStatus.LastOpMessage  = $"ReadFirstPage {officeNameAndId} exception calling GetSearchPage: {Utils.ExceptionInfo(ex)}";
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.TotalPages     = -2;
                return(false);
            }

            if (!_httpRespMsg.IsSuccessStatusCode)
            {
                CurrentStatus.LastOpMessage  = $"ReadFirstPage {officeNameAndId} could not retrieve URL, StatusCode: {_httpRespMsg.StatusCode}";
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.TotalPages     = -1;
                return(false);
            }
            var contentString = _httpRespMsg.Content.ReadAsStringAsync().Result;

            if (string.IsNullOrEmpty(contentString))
            {
                // Fixed: the message was missing the '$' prefix and emitted "{officeNameAndId}" literally.
                CurrentStatus.LastOpMessage  = $"ReadFirstPage {officeNameAndId} received null content";
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.TotalPages     = -1;
                return(false);
            }

            // Counted only after the null/empty check so a null body cannot throw here.
            BytesReceived += contentString.Length;

            // CurrentStatus.LastOpMessage = "ReadFirstPage received document length = " + contentString.Length;

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(contentString);

            // Check for "Search Returned No Results"

            var noResultsNodes = htmlDoc.DocumentNode.SelectNodes(tgtNoResults);

            if (noResultsNodes != null)
            {
                if (noResultsNodes[0].InnerHtml.Contains("Search Returned No Results."))
                {
                    CurrentStatus.LastOpMessage   = $"Search Returned No Candidates for {officeNameAndId}";
                    CurrentStatus.TotalPages      = 0;
                    CurrentStatus.TotalCandidates = 0;
                    CurrentStatus.ScrapeComplete  = true;
                    return(false);
                }
            }

            // Get Hidden Input fields from the HTML Head

            var headNodes = htmlDoc.DocumentNode.SelectNodes(tgtHiddenFields);

            StoreHidden(headNodes);
            // if you use HtmlWeb.Load(URL), run StoreHidden twice from two divs at this level

            // CurrentStatus.LastOpMessage = "StoreHidden: Arguments Added: " + HiddenArguments.Count;
            // PrintKeysAndValues(HiddenArguments);

            // Get page info from the footer

            var footerNodes = htmlDoc.DocumentNode.SelectNodes(tgtFooter);

            if (footerNodes == null)
            {
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.LastOpMessage  = $"ReadFirstPage: FooterNodes search returned null for {officeNameAndId}";
                return(false);
            }
            var footerNodesIh = footerNodes[0].InnerHtml;
            var pageAndCount  = ScrapHelp.GetCurrentPageAndCount(footerNodesIh);

            // CurrentStatus.LastOpMessage = "Page: " + pageAndCount.Item1 + ", PageCount: " + pageAndCount.Item2;
            CurrentStatus.TotalPages = pageAndCount.Item2;
            var hasPageSelectorRow = pageAndCount.Item2 > 1;   // If there is only a single page there is no Page selector row

            CurrentStatus.LastOpMessage = $"ReadFirstPage: Page count for {officeNameAndId} = {pageAndCount.Item2}.";

            // Get candidate data from the table

            var nodes = htmlDoc.DocumentNode.SelectNodes(tgtTable);

            if (nodes == null)
            {
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.LastOpMessage  = $"ReadFirstPage: Data table search returned null for {officeNameAndId}.";
                return(false);
            }

            var htmlDocTh = new HtmlDocument();

            htmlDocTh.LoadHtml(nodes[0].InnerHtml);

            var htmlBodyTh = htmlDocTh.DocumentNode.SelectNodes("//th");

            // Check if the column headers have changed.
            // SelectNodes yields null when the first row has no <th> cells; treat that as an empty
            // header string so it is reported as a header mismatch instead of throwing.
            const string thNames  = "ActionCandidate NameElection YearDOI Filed";
            var          checkStr = htmlBodyTh == null
                ? string.Empty
                : htmlBodyTh.Aggregate(string.Empty, (current, th) => current + th.InnerText); // was InnerHtml for HtmlWeb.Load

            if (thNames != checkStr)
            {
                CurrentStatus.ScrapeComplete = true;
                CurrentStatus.LastOpMessage  = $"ReadFirstPage: {officeNameAndId}, Table header mismatch, should be: {thNames} but is: {checkStr}";
                return(false);
            }

            // Go through the table nodes

            CurrentStatus.LastOpMessage = $"ReadFirstPage: Calling ProcessTable for {officeNameAndId}, nodes.count = {nodes.Count}, hasPageSelectorRow = {hasPageSelectorRow}.";
            var rows = ProcessTable(formSearch, nodes, 1, hasPageSelectorRow);

            CurrentStatus.LastOpMessage     = $"ReadFirstPage read page 1 for {officeNameAndId} with candidate count {rows}";
            CurrentStatus.LastPageCompleted = 1;

            if (CurrentStatus.TotalPages == 1)
            {
                CurrentStatus.ScrapeComplete = true;
                return(false); // Last page, don't continue
            }
            // More pages to do.
            return(true);
        }
コード例 #5
0
        /// <summary>
        /// Walks the rows of a results table, builds a <c>Candidate</c> from each candidate row,
        /// loads its additional info and appends it to <c>Candidates</c>.
        /// Returns number of candidate rows processed.
        /// </summary>
        /// <param name="formSearch">FormSearch object supplying the office type id and office name copied onto each candidate</param>
        /// <param name="nodes">The collection of HTML row nodes; the first row is the header row</param>
        /// <param name="pageNumber">The current page number of the query results (not used by this method)</param>
        /// <param name="hasPageSelectorRow">True when the last row is a page selector row rather than a candidate</param>
        /// <returns>The number of candidates added to <c>Candidates</c> by this call</returns>
        private static int ProcessTable(FormSearch formSearch, HtmlNodeCollection nodes, int pageNumber, bool hasPageSelectorRow = true)
        {
            var rowIndex            = 0; // Index of the row being visited; row 0 is the header
            var candidatesProcessed = 0; // Candidates actually added by this call
            var lastCandidateRowIdx = nodes.Count - (hasPageSelectorRow ? 2 : 1);

            foreach (var node in nodes)
            {
                if (rowIndex == 0)
                {
                    rowIndex++;
                    continue;
                } // Already checked headers

                if (rowIndex == lastCandidateRowIdx + 1)
                {
                    // never gets here if no hasPageSelectorRow: ie single page
                    return(candidatesProcessed);
                } // Last row not needed

                // NOTE(review): the Substring calls below throw if a row lacks the expected
                // NameID= / FilerID= / Type= / <td markers; behavior kept as in the original.
                var candidateNode = node.InnerHtml;
                var nameIdIdx     =
                    candidateNode.IndexOf("NameID=", StringComparison.Ordinal) +
                    7;                                                   // .....&NameID=26758&FilerID=C2017000427&Type=candidate
                var truncatedInner = candidateNode.Substring(nameIdIdx); // 26758&FilerID=C2017000427&Type=candidate.....

                var nameIdEndIdx =
                    truncatedInner.IndexOf("&", StringComparison.Ordinal) - 1; // 26758    &FilerID=C2017000427&Type=candidate
                var filerIdStIdx =
                    truncatedInner.IndexOf("FilerID=", StringComparison.Ordinal) +
                    8; // 26758&FilerID=    C2017000427&Type=candidate
                var filerIdEndIdx =
                    truncatedInner.IndexOf("Type=", StringComparison.Ordinal) -
                    5; // 26758&FilerID=C2017000427    &Type=candidate

                var nameId  = truncatedInner.Substring(0, nameIdEndIdx + 1);
                var filerId = truncatedInner.Substring(filerIdStIdx, filerIdEndIdx - filerIdStIdx);

                var candidate = new Candidate {
                    NameId = nameId, FilerId = filerId, OfficeTypeId = formSearch.OfficeTypeId, OfficeName = formSearch.OfficeName
                };

                var last3TdStr = truncatedInner.Substring(truncatedInner.IndexOf("<td", StringComparison.Ordinal));
                var htmlDocTr  = new HtmlDocument();

                // NOTE(review): the closing tag literal lacks '>' in the original. It is left as-is
                // because the descendant positions used below depend on how this markup is parsed.
                htmlDocTr.LoadHtml("<tr>" + last3TdStr.Trim() + "</tr");
                var trBody = htmlDocTr.DocumentNode.SelectNodes("//tr");

                var tdCounter = 0;

                foreach (var td in trBody.Descendants())
                {
                    var tdText = td.InnerText.Trim();

                    // The case labels are positions in the descendant enumeration, not column numbers.
                    //  Was idx 0,5,10 for HtmlWeb.Load
                    switch (tdCounter++)
                    {
                    case 3:
                        candidate.CandidateName = tdText;
                        break;

                    case 7:

                        var year = 0;

                        if (!Int32.TryParse(tdText, out year))
                        {
                            candidate.Notes += "Empty Election Year.";

                            if (tdText != string.Empty)
                            {
                                CurrentStatus.LastOpMessage = $"Non-int year text: {tdText} for candidate {candidate.CandidateName}.";
                            }
                        }
                        candidate.Year = year;
                        break;

                    case 13:
                    case 16:

                        if (!DateTime.TryParse(tdText, out var dateDoiFiled))
                        {
                            candidate.Notes += "Empty DOI date.";

                            if (tdText != string.Empty)
                            {
                                CurrentStatus.LastOpMessage = $"Non-Date DOI text: {tdText} for candidate {candidate.CandidateName}.";
                            }
                        }
                        candidate.DoiFiled = dateDoiFiled;
                        break;

                    default:
                        // Log.Info($"Process table encountered unknown tdCounter value: {tdCounter}");
                        break;
                    }
                }
                rowIndex++;

                // Get the additional data: office district, affiliation, status
                AdditionalInfo.ReadThePage(candidate);

                Candidates.Add(candidate);
                candidatesProcessed++;
            }

            // Fixed: without a page selector row the original returned rowIndex - 2, one less than
            // the number of candidates added. The explicit counter is correct on both paths.
            return(candidatesProcessed);
        }