private void StartScraping_Click(object sender, EventArgs e)
{
    // Open the SQLite database chosen in the UI. SimpleCRUD's dialect must be
    // set before any Scraper.* call issues queries. The `using` guarantees the
    // connection is closed/disposed even when a parsing pass throws — the
    // original only called conn.Close() on the success path, leaking the
    // connection on any exception.
    using (IDbConnection conn = new SQLiteConnection("Data Source=" + pathDB.Text + ";Version=3;"))
    {
        SimpleCRUD.SetDialect(SimpleCRUD.Dialect.SQLite);
        conn.Open();

        // Launch the browser against the project URL selected in the UI.
        var url = projectURL.SelectedValue.ToString();
        IWebDriver driver = Scraper.OpenWebSite(url, SelectedDriver.SelectedItem.ToString());
        try
        {
            // Reset previous results, then run every parsing pass in order.
            // NOTE(review): pass order looks significant (nodes 1..5 then a
            // field-name fixup) — do not reorder without checking Scraper.
            Scraper.CleanDB(conn);
            Scraper.ParsingNode1(driver, conn);
            Scraper.ParsingNode2(driver, conn);
            Scraper.ParsingNode2a(driver, conn);
            Scraper.ParsingNode3(driver, conn);
            Scraper.ParsingNode4(driver, conn);
            Scraper.ParsingNode5(driver, conn);
            Scraper.UpdFieledName(driver, conn);
        }
        finally
        {
            // Shut the browser down even when a pass fails; the original left
            // the WebDriver (and its driver process) running after every run.
            driver.Quit();
        }
    }
}
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Build the scraper from the session configuration captured on the UI thread.
    Scraper scraper = new Scraper(SessionData.Url, SessionData.Container, SessionData.Item)
    {
        NextPageSelector = SessionData.NextPage,
        DataSeparator = SessionData.DataSeparator,
        WriteColumnHeaders = Settings.WriteColumnHeaders
    };
    scraper.Placeholders.AddRange(SessionData.Placeholders);
    scraper.Fields.AddRange(SessionData.Fields);
    scraper.UpdateProgress += Scraper_UpdateProgress;

    // Block this worker thread until the scrape completes. Blocking is fine
    // here (DoWork already runs off the UI thread), but GetAwaiter().GetResult()
    // is used instead of Task.Wait() so a failure surfaces as the original
    // exception rather than an AggregateException wrapper — RunWorkerCompleted's
    // e.Error then carries the real failure.
    scraper.RunAsync(CsvFile).GetAwaiter().GetResult();

    // Hand the totals back to the RunWorkerCompleted handler.
    e.Result = new ScanResult
    {
        UrlsScanned = scraper.UrlsScanned,
        UrlErrors = scraper.UrlErrors,
    };
}
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter the city you would like to scrape information from:");
        var craigsListCity = Console.ReadLine() ?? String.Empty;

        // First request: the city landing page, used only to discover categories.
        WebData webData = new WebDataBuilder()
            .WithCity(craigsListCity)
            .Build();
        WebDownloader downloadContent = new WebDownloader();
        Content = downloadContent.DownloadContentFrom(webData);

        CategoryScraper scrapeCategory = new CategoryScraper();
        Categories = scrapeCategory.GetCategoryFrom(Content);

        // "sss" is the fallback category used when none can be scraped.
        var userCategory = "sss";
        if (Categories.Any())
        {
            // Categories holds value/display-name pairs flattened into one list.
            // Guard with c + 1 < categoryCount so a trailing unpaired entry
            // cannot throw IndexOutOfRangeException — the original read
            // Categories[c + 1] with only `c < count` checked.
            int categoryCount = Categories.Count;
            for (int c = 0; c + 1 < categoryCount; c += 2)
            {
                Console.WriteLine("Category: {0}, Value: {1}", Categories[c + 1], Categories[c]);
                Console.WriteLine();
            }
            Console.Write("Please enter the \"Value\" of the category you'd like to scrape elements from:");
            userCategory = Console.ReadLine() ?? String.Empty;
        }
        else
        {
            Console.WriteLine("There were no elements found in the category list.");
            Console.Write("A default category will be chosen for you.");
        }

        // Second request: the chosen category's search page.
        webData = new WebDataBuilder()
            .WithCity(craigsListCity)
            .WithCategory(userCategory)
            .Build();
        Content = downloadContent.DownloadContentFrom(webData);
        //Need to check for errors on userCategory
        // https://boston.craigslist.org/search     //link example for city only
        // https://boston.craigslist.org/search/cta //link example w/ category

        // Outer regex matches each listing anchor; the two part-regexes then
        // pull the title text and the href out of each match.
        ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
            .WithData(Content)
            .WithRegex(@"<a href=""(.*?)"" data-id=""(.*?)"" class=""(.*?)"">(.*?)</a>") //this regex pattern works
            .WithRegexOption(RegexOptions.ExplicitCapture)
            .WithParts(new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)<")
                .WithRegexOption(RegexOptions.Singleline)
                .Build())
            .WithParts(new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=""(.*?)""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build())
            .Build();

        Scraper scraper = new Scraper();
        var scrapedElements = scraper.Scrape(scrapeCriteria);
        if (scrapedElements.Any())
        {
            // Elements arrive as title/link pairs; print a blank line after
            // every second element to visually group each pair.
            int printed = 1;
            foreach (var scrapedElement in scrapedElements)
            {
                Console.WriteLine(scrapedElement);
                if (printed % 2 == 0)
                {
                    Console.WriteLine();
                }
                printed++;
            }
        }
        else
        {
            Console.WriteLine("There were no matches found for the specified scrape Criteria.");
        }
    }
    catch (Exception ex)
    {
        // Top-level catch-all: report and fall through to the exit prompt.
        Console.WriteLine("There was an error found: {0}", ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("The program will close shortly, please acknowledge by pressing any key.");
    Console.ReadKey();
}
static void Main(string[] args)
{
    Log.Logger = SetupLogger();

    // args[0] = read-pipe handle, args[1] = write-pipe handle, both inherited
    // from the parent process that spawned this scraper worker.
    if (args.Length == 2)
    {
        using (AnonymousPipeClientStream pipeClientReader = new AnonymousPipeClientStream(PipeDirection.In, args[0]))
        using (PipeStream pipeClientWriter = new AnonymousPipeClientStream(PipeDirection.Out, args[1]))
        {
            CrawlDescription crawlDescription;
            try
            {
                // --- read the crawl description from the parent ---
                using (StreamReader sr = new StreamReader(pipeClientReader))
                {
                    string message;
                    do
                    {
                        message = sr.ReadLine();
                        Log.Debug("Pipe Received Message: {0}", message);

                        // ReadLine returns null once the parent closes its end
                        // of the pipe. The original spun forever in that case
                        // (its TODO acknowledged it); bail out instead — the
                        // outer catch logs the failure and the process exits.
                        if (message == null)
                        {
                            throw new EndOfStreamException("Pipe closed before SYNC was received.");
                        }
                    } while (!message.StartsWith("SYNC"));

                    // The line following SYNC carries the JSON crawl description.
                    message = sr.ReadLine();
                    crawlDescription = JsonConvert.DeserializeObject<CrawlDescription>(message);
                    Log.Debug("Pipe Received Crawl Description: {0}", message);
                }

                // --- run the crawl ---
                CrawlResult crawlResult = null;
                using (Scraper scraper = new Scraper(crawlDescription))
                {
                    scraper.Initialize();
                    // Main is synchronous here, so block for the async result.
                    crawlResult = scraper.Scrape().GetAwaiter().GetResult();
                }

                // --- write the result back, mirroring the SYNC handshake ---
                using (StreamWriter sw = new StreamWriter(pipeClientWriter))
                {
                    sw.AutoFlush = true;

                    // Announce the payload and wait for the parent to drain it.
                    sw.WriteLine("SYNC");
                    pipeClientWriter.WaitForPipeDrain();

                    // Ship the serialized crawl result back to the parent.
                    string serializedCrawlResult = JsonConvert.SerializeObject(crawlResult);
                    sw.WriteLine(serializedCrawlResult);
                }
            }
            catch (Exception ex)
            {
                Log.Error("WebScraper Exception({0}): {1}", ex.GetType(), ex.Message);
            }
        }
    }
    else
    {
        Log.Error("Expected 2 Arguments (PipeWriteHandle and PipeReadHandle).");
    }
}