/// <summary>
/// Click handler for the RegEx-chunk test button. Calls <c>Scrape.RegExChunk</c> with
/// <c>txtCaptureRURL.Text</c> as the first argument plus the start/end expressions, ignore-case flags,
/// iteration numbers, maximum-matches and maximum-chunk-length values entered on the form, then lists
/// every returned string in <c>lstMatchResultsREC</c> and selects the first one.
/// </summary>
/// <remarks>
/// The start/end input controls, the test button, the result list and the match text box are disabled
/// while the call runs and are always re-enabled afterwards. Exceptions are not propagated: the match
/// text box shows "FAIL" and the result list shows the text produced by
/// <c>DetailedException.WithEnterpriseContent</c>.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnRegExChunkTest_Click(object sender, EventArgs e)
{
    Scrape s = new Scrape();

    // Lock the inputs and reset the output area before the (potentially slow) call.
    this.btnRegExChunkTest.Enabled = false;
    this.txtRegExStart.Enabled = false;
    this.chkStartIgnoreCase.Enabled = false;
    this.nudStartIteration.Enabled = false;
    this.txtRegExEnd.Enabled = false;
    this.chkEndIgnoreCase.Enabled = false;
    this.nudEndIteration.Enabled = false;
    this.txtMatchTextREC.Text = string.Empty;
    this.txtMatchTextREC.Enabled = false;
    this.lstMatchResultsREC.Items.Clear();
    this.lstMatchResultsREC.Enabled = false;
    this.Refresh();

    try
    {
        // Convert the numeric values directly: a ToString()/int.Parse round trip is culture-sensitive
        // and throws FormatException when the decimal carries a scale (e.g. "1.0").
        StringCollection results = s.RegExChunk(
            this.txtCaptureRURL.Text,
            this.txtRegExStart.Text,
            this.chkStartIgnoreCase.Checked,
            decimal.ToInt32(this.nudStartIteration.Value),
            this.txtRegExEnd.Text,
            this.chkEndIgnoreCase.Checked,
            decimal.ToInt32(this.nudEndIteration.Value),
            decimal.ToInt32(this.nudMaximumMatches.Value),
            decimal.ToInt32(this.nudMaxChunkLength.Value));

        // The list was already cleared above and nothing has been added since.
        foreach (string str in results)
        {
            this.lstMatchResultsREC.Items.Add(str);
        }

        if (this.lstMatchResultsREC.Items.Count > 0)
        {
            this.lstMatchResultsREC.SelectedIndex = 0;
        }
    }
    catch (Exception ex)
    {
        // Report the failure in the UI instead of letting it escape the event handler.
        this.txtMatchTextREC.Text = "FAIL";
        this.lstMatchResultsREC.Items.Clear();
        this.lstMatchResultsREC.Items.Add(DetailedException.WithEnterpriseContent(ref ex));
    }
    finally
    {
        // Re-enable in finally so the form cannot stay locked even if the error reporting itself throws.
        this.btnRegExChunkTest.Enabled = true;
        this.txtRegExStart.Enabled = true;
        this.chkStartIgnoreCase.Enabled = true;
        this.nudStartIteration.Enabled = true;
        this.txtRegExEnd.Enabled = true;
        this.chkEndIgnoreCase.Enabled = true;
        this.nudEndIteration.Enabled = true;
        this.txtMatchTextREC.Enabled = true;
        this.lstMatchResultsREC.Enabled = true;
    }
}
/// <summary>
/// Checks that <c>Scrape.RegExChunk</c> returns exactly two results for an HTML fragment that contains
/// two date cells (<c>&gt;dd/dd/dd&lt;</c>), each followed later by an <c>&gt;X&lt;digit&gt;&lt;</c> cell.
/// </summary>
public void ScrapeTest_RegexChunk()
{
    // Arrange
    string source = "<td>05/30/17</td><td>3</td><td>X2</td><td>05/31/17</td><td>2</td><td c=34>X4</td>";
    string startExpr = @">\d\d/\d\d/\d\d<";
    string endExpr = @">X\d<";
    var scrape = new Scrape();

    // Act
    var results = scrape.RegExChunk(source, startExpr, true, 0, endExpr, true, 1, 0, 1000);

    // Assert (the original repeated this identical assertion twice; once is sufficient)
    Assert.That(results.Count == 2, string.Format("Count == {0}", results.Count));
}
/// <summary>
/// Click handler for the full-example button. Downloads the past-winning-numbers page for the year
/// selected in <c>cbxYear</c> via <c>Scrape.HttpCapture</c>, splits the content into table rows and
/// cells with <c>Scrape.RegExChunk</c>, and for each well-formed row extracts a date, a formatted
/// number string and a payout link.
/// </summary>
/// <remarks>
/// Each extracted row is added to <c>dgvFullExample</c> and appended as a tab-separated line (after a
/// "Date/Numbers/PayoutInfo" header) to the text placed in <c>txtFullExampleResult</c>. Progress and
/// per-row problems are logged to <c>lstFullExampleActivity</c>. Exceptions are not propagated: their
/// text (from <c>DetailedException.WithUserContent</c>) is shown in <c>txtFullExampleResult</c> and the
/// grid is hidden.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnFullExample_Click(object sender, EventArgs e)
{
    const string chromeUserAgent = "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36";

    Scrape s = new Scrape();
    StringBuilder result = new StringBuilder();
    result.AppendLine("Date\tNumbers\tPayoutInfo");

    this.btnFullExample.Enabled = false;
    this.txtFullExampleResult.Text = string.Empty;
    this.lstFullExampleActivity.Items.Clear();
    this.dgvFullExample.Rows.Clear();
    // NOTE(review): Focus() on a TabPage may not switch the visible tab; TabControl.SelectTab could be
    // what is intended here and at the end of this method - confirm before changing.
    this.tabControl4.TabPages["tabActivity"].Focus();
    this.Refresh();

    bool succeeded = true;
    string captured = string.Empty;

    // Step 1: download the page.
    try
    {
        string sourceUrl = string.Format(
            "http://www.palottery.state.pa.us/Games/Print-Past-Winning-Numbers.aspx?id=12&year={0}&print=1",
            (string)this.cbxYear.Items[this.cbxYear.SelectedIndex]);
        this.lstFullExampleActivity.Items.Add("Loading content from URL ...");
        this.lstFullExampleActivity.Items.Add(sourceUrl);

        WebScrapeResponse r = s.HttpCapture(sourceUrl, string.Empty, chromeUserAgent, null);
        this.groupBox1.Text = string.Format("Done: ({0}) {1}", r.StatusCode, r.StatusText);

        if (!string.IsNullOrWhiteSpace(r.ErrorMessage))
        {
            throw new Exception("Unexpected error message from HttpCapture: " + r.ErrorMessage);
        }

        if (r.StatusCode != "200")
        {
            throw new Exception("Status code value was not 200. Can not continue");
        }

        this.lstFullExampleActivity.Items.Add("Loading complete");
        captured = r.Content;
    }
    catch (Exception err)
    {
        this.lstFullExampleActivity.Items.Add("Failed to retrieve URL: see result for error text");
        this.txtFullExampleResult.Text = DetailedException.WithUserContent(ref err);
        succeeded = false;
    }

    // Step 2: extract rows and cells from the downloaded content.
    if (succeeded)
    {
        this.lstFullExampleActivity.Items.Add("Extract informational rows...");
        try
        {
            StringCollection infoRows = s.RegExChunk(captured, @"<tr>\s+<td>\d\d/\d\d/\d\d\d\d</td><td>\s+\d[\d]?\s+ ", true, 0, "</tr>", true, 1, 0, 0);
            if (infoRows.Count == 0)
            {
                throw new Exception("No informational rows encountered.");
            }

            // Indexed loop so the malformed-row message reports the real row position
            // (the original counter was never incremented and always reported 0).
            for (int rowIndex = 0; rowIndex < infoRows.Count; rowIndex++)
            {
                string infoRow = infoRows[rowIndex];
                StringCollection infoDraws = s.RegExChunk(infoRow, @"<td[>|\s]", true, 0, "</td>", true, 1, 0, 0);
                if (infoDraws.Count != 3)
                {
                    this.lstFullExampleActivity.Items.Add(string.Format("Can not read malformed row({0}): {1}", rowIndex, infoRow));
                    continue;
                }

                string[] values = new string[3];

                // Cell 0: date.
                values[0] = s.RegExFind(infoDraws[0], @"([\d]?[\d]/){2}[\d]{4}", true, 0)[0];
                result.Append(values[0]);

                // Cell 1: numbers - keep only the digits and the semi-colons, then split on ';'.
                string numberRaw = Regex.Replace(infoDraws[1], @"[^0-9;]", string.Empty);
                string[] n = numberRaw.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
                if (n.Length == 7)
                {
                    // No leading tab here: the column separator is written once, below.
                    values[1] = string.Format("{0}-{1}-{2}-{3}-{4} [{5}] x{6}", n[0], n[1], n[2], n[3], n[4], n[5], n[6]);
                }
                else
                {
                    // Log the cleaned cell text (passing the array itself would print "System.String[]").
                    this.lstFullExampleActivity.Items.Add(string.Format("Expected 7 numbers, but got {0} in \"{1}\"", n.Length, numberRaw));
                }

                // Always emit the separator so the columns stay aligned even when the numbers are missing
                // (values[1] is then null and Append adds nothing for it).
                result.Append("\t").Append(values[1]);

                // Cell 2: link for payout stats.
                values[2] = "http://www.palottery.state.pa.us/Games" + s.RegExFind(infoDraws[2], "/Payouts.aspx[^\"]+", true, 0)[0];
                result.AppendLine("\t" + values[2]);

                this.dgvFullExample.Rows.Add(values);
            }

            this.txtFullExampleResult.Text = result.ToString();
            this.lstFullExampleActivity.Items.Add("Extraction successfully completed. See result tab");
        }
        catch (Exception err)
        {
            // This stage parses already-downloaded content, so do not report it as a URL failure.
            this.lstFullExampleActivity.Items.Add("Failed to extract rows: see result for error text");
            this.txtFullExampleResult.Text = DetailedException.WithUserContent(ref err);
            succeeded = false;
        }
    }

    this.btnFullExample.Enabled = true;
    this.tabControl4.TabPages[succeeded ? "tabResult" : "tabActivity"].Focus();
    this.dgvFullExample.Visible = succeeded;
}