// Example #1
        /// <summary>
        /// Click handler that runs the full scrape pipeline: opens the SQLite
        /// database chosen in <c>pathDB</c>, launches a WebDriver for the URL
        /// selected in <c>projectURL</c>, then executes each parsing pass in
        /// sequence against the same open connection.
        /// </summary>
        /// <param name="sender">The button that raised the event (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void StartScraping_Click(object sender, EventArgs e)
        {
            SimpleCRUD.SetDialect(SimpleCRUD.Dialect.SQLite);

            // 'using' guarantees the connection is closed and disposed even if
            // OpenWebSite or any parsing step throws — the original leaked the
            // connection on any exception because Close() was only reached on
            // the happy path.
            using (IDbConnection conn = new SQLiteConnection("Data Source=" + pathDB.Text + ";Version=3;"))
            {
                conn.Open();

                // Initialize WebDriver from the current UI selections.
                var        url    = projectURL.SelectedValue.ToString();
                IWebDriver driver = Scraper.OpenWebSite(url, SelectedDriver.SelectedItem.ToString());

                // Order matters: CleanDB wipes previous results before the
                // parsing passes repopulate the tables.
                Scraper.CleanDB(conn);
                Scraper.ParsingNode1(driver, conn);
                Scraper.ParsingNode2(driver, conn);
                Scraper.ParsingNode2a(driver, conn);
                Scraper.ParsingNode3(driver, conn);
                Scraper.ParsingNode4(driver, conn);
                Scraper.ParsingNode5(driver, conn);
                Scraper.UpdFieledName(driver, conn);
            }
        }
// Example #2
        /// <summary>
        /// BackgroundWorker entry point: configures a <c>Scraper</c> from the
        /// session settings, runs it to completion against <c>CsvFile</c>, and
        /// publishes the scan counts via <paramref name="e"/>.Result.
        /// </summary>
        /// <param name="sender">The BackgroundWorker that raised the event.</param>
        /// <param name="e">Carries the <c>ScanResult</c> back to RunWorkerCompleted.</param>
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            // Configure web scraper object from the current session/settings.
            Scraper scraper = new Scraper(SessionData.Url, SessionData.Container, SessionData.Item)
            {
                NextPageSelector   = SessionData.NextPage,
                DataSeparator      = SessionData.DataSeparator,
                WriteColumnHeaders = Settings.WriteColumnHeaders
            };

            scraper.Placeholders.AddRange(SessionData.Placeholders);
            scraper.Fields.AddRange(SessionData.Fields);
            scraper.UpdateProgress += Scraper_UpdateProgress;

            Task task = scraper.RunAsync(CsvFile);

            // Blocking is acceptable here (DoWork already runs on a worker
            // thread), but GetAwaiter().GetResult() is used instead of Wait()
            // so a failure surfaces as the original exception rather than an
            // AggregateException wrapper — RunWorkerCompleted then sees the
            // real error in e.Error.
            task.GetAwaiter().GetResult();

            e.Result = new ScanResult
            {
                UrlsScanned = scraper.UrlsScanned,
                UrlErrors   = scraper.UrlErrors,
            };
        }
// Example #3
        /// <summary>
        /// Interactive console flow: asks for a Craigslist city, downloads the
        /// city's search page, lists the discovered categories, lets the user
        /// pick one (defaulting to "sss" = all-for-sale), then scrapes and
        /// prints anchor elements matching the hard-coded criteria.
        /// </summary>
        /// <param name="args">Unused command-line arguments.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter the city you would like to scrape information from:");
                var craigsListCity = Console.ReadLine() ?? String.Empty;

                WebData webData = new WebDataBuilder()
                                  .WithCity(craigsListCity)
                                  .Build();

                WebDownloader downloadContent = new WebDownloader();

                Content = downloadContent.DownloadContentFrom(webData);

                CategoryScraper scrapeCategory = new CategoryScraper();
                Categories = scrapeCategory.GetCategoryFrom(Content);

                // Default category: "sss" is Craigslist's all-for-sale search.
                var userCategory = "sss";

                if (Categories.Any())
                {
                    // Categories is a flat list of (value, name) pairs stored as
                    // consecutive entries. Guard with c + 1 < count so an
                    // odd-length list cannot throw IndexOutOfRangeException on
                    // the trailing unpaired entry (bug in the original loop).
                    int count = Categories.Count;
                    for (int c = 0; c + 1 < count; c += 2)
                    {
                        Console.WriteLine("Category: {0}, Value: {1}", Categories[c + 1], Categories[c]);
                        Console.WriteLine();
                    }

                    Console.Write("Please enter the \"Value\" of the category you'd like to scrape elements from:");
                    userCategory = Console.ReadLine() ?? String.Empty;
                }
                else
                {
                    Console.WriteLine("There were no elements found in the category list.");
                    Console.Write("A default category will be chosen for you.");
                }

                // Re-download with the chosen category appended to the URL.
                webData = new WebDataBuilder()
                          .WithCity(craigsListCity)
                          .WithCategory(userCategory)
                          .Build();

                Content = downloadContent.DownloadContentFrom(webData);

                //Need to check for errors on userCategory

                // https://boston.craigslist.org/search //link example for city only
                // https://boston.craigslist.org/search/cta //link example w/ category

                // Outer regex finds result anchors; the two part-builders then
                // extract the link text and the href from each match.
                ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                .WithData(Content)
                                                .WithRegex(@"<a href=""(.*?)"" data-id=""(.*?)"" class=""(.*?)"">(.*?)</a>") //this regex pattern works
                                                .WithRegexOption(RegexOptions.ExplicitCapture)
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@">(.*?)<")
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@"href=""(.*?)""")
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                .Build();

                Scraper scraper = new Scraper();

                var scrapedElements = scraper.Scrape(scrapeCriteria);

                if (scrapedElements.Any())
                {
                    // Elements come back in pairs (text, href); print a blank
                    // line after every second element to group them visually.
                    int printed = 1;
                    foreach (var scrapedElement in scrapedElements)
                    {
                        Console.WriteLine(scrapedElement);

                        if (printed % 2 == 0)
                        {
                            Console.WriteLine();
                        }

                        printed++;
                    }
                }
                else
                {
                    Console.WriteLine("There were no matches found for the specified scrape Criteria.");
                }
            }
            catch (Exception ex) { Console.WriteLine("There was an error found: {0}", ex.Message); }

            Console.WriteLine();
            Console.WriteLine("The program will close shortly, please acknowledge by pressing any key.");
            Console.ReadKey();
        }
// Example #4
        /// <summary>
        /// Child-process entry point for a scrape job driven over anonymous
        /// pipes. Expects exactly two arguments: args[0] is the handle of the
        /// inbound pipe (parent -> child) and args[1] the outbound pipe
        /// (child -> parent). Protocol: wait for a "SYNC" line, read one JSON
        /// CrawlDescription line, run the scrape, then write "SYNC" followed by
        /// the JSON-serialized CrawlResult back to the parent.
        /// </summary>
        /// <param name="args">[0] read-pipe handle, [1] write-pipe handle.</param>
        static void Main(string[] args)
        {
            Log.Logger = SetupLogger();

            if (args.Length == 2)
            {
                using (AnonymousPipeClientStream pipeClientReader =
                           new AnonymousPipeClientStream(PipeDirection.In, args[0]))
                    using (PipeStream pipeClientWriter =
                               new AnonymousPipeClientStream(PipeDirection.Out, args[1]))
                    {
                        CrawlDescription crawlDescription;

                        // read crawl description from pipe
                        try
                        {
                            // NOTE: disposing this StreamReader also closes the
                            // underlying pipeClientReader, so all reading must
                            // finish inside this block.
                            using (StreamReader sr = new StreamReader(pipeClientReader))
                            {
                                string message;

                                // Skip any noise lines until the parent's "SYNC"
                                // marker arrives; the line immediately after it
                                // carries the JSON payload.
                                do
                                {
                                    // TODO(zvp) : have to exit eventually.
                                    message = sr.ReadLine();
                                    Log.Debug("Pipe Received Message: {0}", message);
                                } while (message == null || !message.StartsWith("SYNC"));

                                message          = sr.ReadLine();
                                crawlDescription = JsonConvert.DeserializeObject <CrawlDescription>(message);
                                Log.Debug("Pipe Received Crawl Description: {0}", message);
                            }

                            // process the message
                            // Scraper is disposed before the result is written
                            // back, releasing browser/driver resources early.
                            CrawlResult crawlResult = null;
                            using (Scraper scraper = new Scraper(crawlDescription))
                            {
                                scraper.Initialize();
                                // Synchronous bridge over the async scrape; OK
                                // here since Main has no synchronization context.
                                crawlResult = scraper.Scrape().GetAwaiter().GetResult();
                            }

                            using (StreamWriter sw = new StreamWriter(pipeClientWriter))
                            {
                                sw.AutoFlush = true;

                                // write Sync message and wait for drain
                                // (mirrors the parent's handshake: SYNC first,
                                // then exactly one JSON line).
                                sw.WriteLine("SYNC");
                                pipeClientWriter.WaitForPipeDrain();

                                // write back the crawl result
                                string serializedCrawlResult = JsonConvert.SerializeObject(crawlResult);
                                sw.WriteLine(serializedCrawlResult);
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best-effort: failures are logged, not reported to
                            // the parent — the parent sees a silent child exit.
                            Log.Error("WebScraper Exception({0}): {1}", ex.GetType(), ex.Message);
                        }
                    }
            }
            else
            {
                Log.Error("Expected 2 Arguments (PipeWriteHandle and PipeReadHandle).");
            }
        }