Beispiel #1
0
        /// <summary>
        /// Runs <c>Scrape.RegExChunk</c> against the text in <c>txtCaptureRURL</c> using the
        /// start/end expressions and numeric limits currently entered on the form, and lists
        /// the returned chunks in <c>lstMatchResultsREC</c>.
        /// </summary>
        /// <remarks>
        /// The input controls are disabled while the call runs and are always re-enabled
        /// afterwards. Any exception raised by the conversion of the numeric values or by
        /// <c>RegExChunk</c> is shown in the result list instead of being propagated.
        /// </remarks>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnRegExChunkTest_Click(object sender, EventArgs e)
        {
            Scrape s = new Scrape();

            this.SetRegExChunkTestControlsEnabled(false);
            this.txtMatchTextREC.Text = string.Empty;
            this.lstMatchResultsREC.Items.Clear();
            this.Refresh();

            try
            {
                // NumericUpDown.Value is a decimal; convert it directly instead of
                // round-tripping through a culture-formatted string, which fails for
                // values carrying a fractional scale (e.g. "1.0").
                StringCollection results = s.RegExChunk(
                    this.txtCaptureRURL.Text,
                    this.txtRegExStart.Text,
                    this.chkStartIgnoreCase.Checked,
                    decimal.ToInt32(this.nudStartIteration.Value),
                    this.txtRegExEnd.Text,
                    this.chkEndIgnoreCase.Checked,
                    decimal.ToInt32(this.nudEndIteration.Value),
                    decimal.ToInt32(this.nudMaximumMatches.Value),
                    decimal.ToInt32(this.nudMaxChunkLength.Value));

                foreach (string str in results)
                {
                    this.lstMatchResultsREC.Items.Add(str);
                }
                if (this.lstMatchResultsREC.Items.Count > 0)
                {
                    this.lstMatchResultsREC.SelectedIndex = 0;
                }
            }
            catch (Exception ex)
            {
                this.txtMatchTextREC.Text = "FAIL";
                this.lstMatchResultsREC.Items.Clear();
                this.lstMatchResultsREC.Items.Add(DetailedException.WithEnterpriseContent(ref ex));
            }
            finally
            {
                // Runs even if the catch handler itself throws, so the form is never
                // left with its inputs permanently disabled.
                this.SetRegExChunkTestControlsEnabled(true);
            }
        }

        /// <summary>
        /// Enables or disables the controls that are locked while the RegExChunk test runs.
        /// </summary>
        /// <param name="enabled"><c>true</c> to enable the controls; <c>false</c> to disable them.</param>
        private void SetRegExChunkTestControlsEnabled(bool enabled)
        {
            this.btnRegExChunkTest.Enabled  = enabled;
            this.txtRegExStart.Enabled      = enabled;
            this.chkStartIgnoreCase.Enabled = enabled;
            this.nudStartIteration.Enabled  = enabled;
            this.txtRegExEnd.Enabled        = enabled;
            this.chkEndIgnoreCase.Enabled   = enabled;
            this.nudEndIteration.Enabled    = enabled;
            this.txtMatchTextREC.Enabled    = enabled;
            this.lstMatchResultsREC.Enabled = enabled;
        }
Beispiel #2
0
        /// <summary>
        /// Verifies that <c>Scrape.RegExChunk</c> returns two chunks for a sample HTML fragment
        /// that contains two date cells, each later followed by an "X" + digit cell.
        /// </summary>
        public void ScrapeTest_RegexChunk()
        {
            string source    = "<td>05/30/17</td><td>3</td><td>X2</td><td>05/31/17</td><td>2</td><td c=34>X4</td>";
            string startExpr = @">\d\d/\d\d/\d\d<";
            string endExpr   = @">X\d<";

            var scrape  = new Scrape();
            var results = scrape.RegExChunk(source, startExpr, true, 0, endExpr, true, 1, 0, 1000);

            // NOTE(review): only the number of chunks is checked; the chunk contents are not
            // verified here. The original repeated this same assertion twice verbatim.
            Assert.That(results.Count == 2, string.Format("Count == {0}", results.Count));
        }
Beispiel #3
0
        /// <summary>
        /// Downloads the past-winning-numbers page for the year selected in <c>cbxYear</c>,
        /// extracts one entry (date, numbers, payout link) per table row, and shows the
        /// entries in <c>dgvFullExample</c> and as tab-separated text in
        /// <c>txtFullExampleResult</c>. Progress and problems are logged to
        /// <c>lstFullExampleActivity</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnFullExample_Click(object sender, EventArgs e)
        {
            Scrape s = new Scrape();

            this.btnFullExample.Enabled    = false;
            this.txtFullExampleResult.Text = string.Empty;
            this.lstFullExampleActivity.Items.Clear();
            this.dgvFullExample.Rows.Clear();
            // SelectTab actually switches the visible tab; calling Focus() on a TabPage does not.
            this.tabControl4.SelectTab("tabActivity");
            this.Refresh();

            bool succeeded = false;
            try
            {
                string captured;
                succeeded = this.TryCaptureFullExamplePage(s, out captured)
                         && this.TryExtractFullExampleRows(s, captured);
            }
            finally
            {
                // Always give the button back, even if an error handler itself failed.
                this.btnFullExample.Enabled = true;
            }

            this.tabControl4.SelectTab(succeeded ? "tabResult" : "tabActivity");
            this.dgvFullExample.Visible = succeeded;
        }

        /// <summary>
        /// Fetches the page for the selected year via <c>Scrape.HttpCapture</c>.
        /// </summary>
        /// <param name="s">Scraper used to perform the HTTP request.</param>
        /// <param name="captured">Receives the response content on success; empty on failure.</param>
        /// <returns>
        /// <c>true</c> when the response has no error message and status code "200";
        /// otherwise <c>false</c>, after the error has been written to the activity list
        /// and the result text box.
        /// </returns>
        private bool TryCaptureFullExamplePage(Scrape s, out string captured)
        {
            const string chromeUserAgent = "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36";

            captured = string.Empty;
            try
            {
                string sourceUrl = string.Format(
                    "http://www.palottery.state.pa.us/Games/Print-Past-Winning-Numbers.aspx?id=12&year={0}&print=1",
                    (string)this.cbxYear.Items[this.cbxYear.SelectedIndex]);

                this.lstFullExampleActivity.Items.Add("Loading content from URL ...");
                this.lstFullExampleActivity.Items.Add(sourceUrl);
                WebScrapeResponse r = s.HttpCapture(sourceUrl, string.Empty, chromeUserAgent, null);
                this.groupBox1.Text = string.Format("Done: ({0}) {1}", r.StatusCode, r.StatusText);
                if (!string.IsNullOrWhiteSpace(r.ErrorMessage))
                {
                    throw new Exception("Unexpected error message from HttpCapture: " + r.ErrorMessage);
                }
                if (r.StatusCode != "200")
                {
                    throw new Exception("Status code value was not 200.  Can not continue");
                }
                this.lstFullExampleActivity.Items.Add("Loading complete");
                captured = r.Content;
                return true;
            }
            catch (Exception err)
            {
                this.lstFullExampleActivity.Items.Add("Failed to retrieve URL: see result for error text");
                this.txtFullExampleResult.Text = DetailedException.WithUserContent(ref err);
                return false;
            }
        }

        /// <summary>
        /// Splits the captured page into table rows and cells, and fills the grid and the
        /// tab-separated result text with one entry per well-formed row.
        /// </summary>
        /// <remarks>
        /// Rows that do not have exactly three cells, or whose numbers cell does not yield
        /// exactly seven values, are logged to the activity list and skipped entirely so
        /// that the grid and the result text never contain partial rows.
        /// </remarks>
        /// <param name="s">Scraper used for the regular-expression extraction.</param>
        /// <param name="captured">Page content to extract rows from.</param>
        /// <returns>
        /// <c>true</c> when extraction completed; <c>false</c> when an exception occurred
        /// (including the case where no rows were found), after the error has been written
        /// to the activity list and the result text box.
        /// </returns>
        private bool TryExtractFullExampleRows(Scrape s, string captured)
        {
            StringBuilder result = new StringBuilder();
            result.AppendLine("Date\tNumbers\tPayoutInfo");

            this.lstFullExampleActivity.Items.Add("Extract informational rows...");
            try
            {
                StringCollection infoRows = s.RegExChunk(captured, @"<tr>\s+<td>\d\d/\d\d/\d\d\d\d</td><td>\s+\d[\d]?\s+&nbsp;", true, 0, "</tr>", true, 1, 0, 0);
                if (infoRows.Count == 0)
                {
                    throw new Exception("No informational rows encountered.");
                }

                int rowIndex = -1;
                foreach (string infoRow in infoRows)
                {
                    // Advance first so skipped rows are still counted in later messages.
                    rowIndex++;

                    StringCollection infoDraws = s.RegExChunk(infoRow, @"<td[>|\s]", true, 0, "</td>", true, 1, 0, 0);
                    if (infoDraws.Count != 3)
                    {
                        this.lstFullExampleActivity.Items.Add(string.Format("Can not read malformed row({0}): {1}", rowIndex, infoRow));
                        continue;
                    }

                    // Cell 0: date
                    string date = s.RegExFind(infoDraws[0], @"([\d]?[\d]/){2}[\d]{4}", true, 0)[0];

                    // Cell 1: numbers -- keep just the digits and the semi-colons
                    string   numberRaw = Regex.Replace(infoDraws[1], @"[^0-9;]", string.Empty);
                    string[] n         = numberRaw.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
                    if (n.Length != 7)
                    {
                        // Report the offending text rather than the array object, and skip the whole row.
                        this.lstFullExampleActivity.Items.Add(string.Format("Expected 7 numbers, but got {0} in \"{1}\"", n.Length, numberRaw));
                        continue;
                    }
                    string numbers = string.Format("{0}-{1}-{2}-{3}-{4} [{5}] x{6}", n[0], n[1], n[2], n[3], n[4], n[5], n[6]);

                    // Cell 2: link for payout stats
                    string payoutLink = "http://www.palottery.state.pa.us/Games" + s.RegExFind(infoDraws[2], "/Payouts.aspx[^\"]+", true, 0)[0];

                    // Emit the row only once all three values are known; a single tab between
                    // columns keeps the line aligned with the header.
                    result.AppendLine(date + "\t" + numbers + "\t" + payoutLink);

                    string[] values = new string[] { date, numbers, payoutLink };
                    this.dgvFullExample.Rows.Add(values);
                }

                this.txtFullExampleResult.Text = result.ToString();
                this.lstFullExampleActivity.Items.Add("Extraction successfully completed.  See result tab");
                return true;
            }
            catch (Exception err)
            {
                this.lstFullExampleActivity.Items.Add("Failed to extract rows: see result for error text");
                this.txtFullExampleResult.Text = DetailedException.WithUserContent(ref err);
                return false;
            }
        }