Esempio n. 1
0
        /// <summary>
        /// Console entry point: asks for a Craigslist city and a search term, downloads the
        /// matching results page, and prints every element returned by the scraper.
        /// Any exception raised along the way is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // SCRAPING PROCESS
            //   1. Download the page content.
            //   2. Describe what to look for (regular expressions assembled with the builder pattern).
            //   3. Run the scraper and print what it found.

            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from:");
                var craigslistCity = Console.ReadLine() ?? string.Empty;
                Console.WriteLine("Please enter the CraigsList category that you would like to scrape:");
                var craigslistCategoryName = Console.ReadLine() ?? string.Empty;

                // The city is used as the subdomain, so spaces are stripped ("new york" -> "newyork").
                var citySubdomain = craigslistCity.Replace(" ", string.Empty);

                // The second answer is sent as the value of the "query" parameter. Percent-encode it so
                // characters such as '&', '#' and '+' cannot truncate or corrupt the query string.
                var encodedQuery = Uri.EscapeDataString(craigslistCategoryName);

                using (WebClient client = new WebClient())
                {
                    // `Method` is a member declared outside this method; it supplies a path segment.
                    string content = client.DownloadString($"http://{citySubdomain}.craigslist.org/{Method}/sss?query={encodedQuery}");

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Matches a whole result-title anchor: <a href="..." data-id="..." class="result-title hdrlnk">...</a>
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    // Part 1 (a builder nested inside the builder): the text between '>' and '</a>'.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    // Part 2: the value of the href attribute.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .Build(); // assemble the complete criteria

                    Scraper scraper = new Scraper();

                    var scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria");
                    }
                }
            }
            catch (Exception ex)
            {
                // Top-level handler: report the failure instead of crashing the console app.
                Console.WriteLine(ex.Message);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Interactive entry point. Reads a city and a category from the console, fetches the
        /// corresponding Craigslist page, and writes each scraped element on its own line.
        /// Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                Console.Write("Enter the city to scrape information from: ");
                string cityInput = Console.ReadLine() ?? string.Empty;

                Console.Write("Enter the CraigList category: ");
                string categoryInput = Console.ReadLine() ?? string.Empty;

                using (var webClient = new WebClient())
                {
                    // The city is used as the subdomain, so spaces are removed from it.
                    // `Method` is declared outside this method.
                    string html = webClient.DownloadString(
                        $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}");

                    // Top-level pattern: one complete result-title anchor element.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                        .WithData(html)
                        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                        .WithRegexOption(RegexOptions.ExplicitCapture);

                    // First part: the text between '>' and '</a>'.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                        .WithRegex(@">(.*?)</a>")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build();
                    var builderWithTitle = criteriaBuilder.WithPart(titlePart);

                    // Second part: the href attribute value.
                    var linkPart = new ScrapeCriteriaPartBuilder()
                        .WithRegex(@"href=\""(.*?)\""")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build();

                    ScrapeCriteria criteria = builderWithTitle.WithPart(linkPart).Build();

                    var pageScraper = new Scraper();
                    var matches = pageScraper.Scrape(criteria);

                    if (!matches.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria.");
                    }
                    else
                    {
                        foreach (var match in matches)
                        {
                            Console.WriteLine(match);
                        }
                    }
                }

                // Blank line after the output.
                Console.WriteLine();
            }
            catch (Exception error)
            {
                Console.WriteLine(error.Message);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Console entry point: prompts for a city and a Craigslist category, downloads the
        /// page for that pair over HTTPS, prints every scraped element, and then waits for
        /// Enter before exiting. Exceptions are reported by printing their message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Enter which city you would like to scrape info from:");
                // ReadLine returns null when standard input is closed or redirected to an empty
                // stream; coalesce so the string calls below cannot throw NullReferenceException.
                var cityName = Console.ReadLine() ?? string.Empty;

                // The city is used as the subdomain, so spaces are removed from it.
                var citySubdomain = cityName.Replace(" ", string.Empty);
                if (string.IsNullOrWhiteSpace(citySubdomain))
                {
                    // Without a city the host would be ".craigslist.org"; fail with a clear message instead.
                    Console.WriteLine("A city name is required.");
                    return;
                }

                Console.WriteLine("Enter which the CraigsList category:");
                var categoryName = Console.ReadLine() ?? string.Empty;

                using (WebClient client = new WebClient())
                {
                    // `Method` is a member declared outside this method; it supplies a path segment.
                    string content = client.DownloadString($"https://{citySubdomain}.craigslist.org/{Method}/{categoryName}");

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Matches one complete result-title anchor element.
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    // Part 1: the text between '>' and '</a>'.
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex(@">(.*?)</a>")
                                                               .WithRegexOption(RegexOptions.Singleline)
                                                               .Build())
                                                    // Part 2: the href attribute value.
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex(@"href=\""(.*?)\""")
                                                               .WithRegexOption(RegexOptions.Singleline)
                                                               .Build())
                                                    .Build();

                    Scraper scraper         = new Scraper();
                    var     scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("there were no matches for scraping");
                    }
                }

                // Keep the console window open until the user presses Enter.
                Console.ReadLine();
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Console entry point: prompts for a city and a Craigslist category, downloads the
        /// page for that pair, and prints every element returned by the scraper.
        /// </summary>
        /// <remarks>
        /// This method has no exception handling, so a failure in
        /// <c>DownloadString</c> or in the scraper propagates out of <c>Main</c>.
        /// </remarks>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Please enter which city you would like to scrape:");

            var craigsListCity = Console.ReadLine() ?? string.Empty;

            Console.WriteLine("Please enter the CraigsList category you would like to scrape:");

            var craigsListCategory = Console.ReadLine() ?? string.Empty;

            using (WebClient client = new WebClient())
            {
                // The city is used as the subdomain, so spaces are removed from it.
                // `Method` is a member declared outside this method; it supplies a path segment.
                string content = client.DownloadString($"http://{craigsListCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craigsListCategory}");

                ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                .WithData(content)
                                                // Matches one complete result-title anchor element. The class name is
                                                // "result-title" (hyphen); the previous "result=title" looked like a typo
                                                // that kept the pattern from matching.
                                                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                .WithRegexOption(System.Text.RegularExpressions.RegexOptions.ExplicitCapture)
                                                // Part 1: the text between '>' and '</a>'.
                                                .WithPart(new ScrapeCriteriaPartBuilder()
                                                          .WithRegex(@">(.*?)</a>")
                                                          .WithRegexOption(System.Text.RegularExpressions.RegexOptions.Singleline)
                                                          .Build())
                                                // Part 2: the href attribute value.
                                                .WithPart(new ScrapeCriteriaPartBuilder()
                                                          .WithRegex(@"href=\""(.*?)\""")
                                                          .WithRegexOption(System.Text.RegularExpressions.RegexOptions.Singleline)
                                                          .Build())
                                                .Build();

                Scraper scraper = new Scraper();

                var scrapedElements = scraper.Scrape(scrapeCriteria);

                if (scrapedElements.Any())
                {
                    foreach (var scrapedElement in scrapedElements)
                    {
                        Console.WriteLine(scrapedElement);
                    }
                }
                else
                {
                    Console.WriteLine("no matches");
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Console entry point. Downloads the page for a user-supplied city, lists the
        /// categories scraped from it, lets the user pick one (falling back to "sss" when
        /// no categories were found), downloads that category page and prints the scraped
        /// elements. Errors are reported on the console, and the program waits for a key
        /// press before exiting.
        /// </summary>
        /// <remarks>
        /// <c>Content</c> and <c>Categories</c> are members declared outside this method.
        /// </remarks>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter the city you would like to scrape information from:");
                var craigsListCity = Console.ReadLine() ?? String.Empty;

                // First download: city only, used to discover the available categories.
                WebData webData = new WebDataBuilder()
                                  .WithCity(craigsListCity)
                                  .Build();

                WebDownloader downloadContent = new WebDownloader();

                Content = downloadContent.DownloadContentFrom(webData);

                CategoryScraper scrapeCategory = new CategoryScraper();
                Categories = scrapeCategory.GetCategoryFrom(Content);

                // Default category, used when the category list comes back empty.
                var userCategory = "sss";

                if (Categories.Any())
                {
                    int categoryCount = Categories.Count;

                    // Entries are consumed in pairs: index c is printed as the "Value" and c + 1 as
                    // the "Category". The bound is c + 1 (not c) so an odd number of entries cannot
                    // index past the end of the list; a trailing unpaired entry is skipped.
                    for (int c = 0; c + 1 < categoryCount; c += 2)
                    {
                        Console.WriteLine("Category: {0}, Value: {1}", Categories[c + 1], Categories[c]);
                        Console.WriteLine();
                    }

                    Console.Write("Please enter the \"Value\" of the category you'd like to scrape elements from:");
                    userCategory = Console.ReadLine() ?? String.Empty;
                }
                else
                {
                    Console.WriteLine("There were no elements found in the category list.");
                    Console.Write("A default category will be chosen for you.");
                }

                // Second download: city plus the chosen category.
                webData = new WebDataBuilder()
                          .WithCity(craigsListCity)
                          .WithCategory(userCategory)
                          .Build();

                Content = downloadContent.DownloadContentFrom(webData);

                // TODO: validate userCategory before downloading.

                // https://boston.craigslist.org/search      -- link example for city only
                // https://boston.craigslist.org/search/cta  -- link example with category

                ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                .WithData(Content)
                                                // Matches an anchor with href, data-id and class attributes (any class value).
                                                .WithRegex(@"<a href=""(.*?)"" data-id=""(.*?)"" class=""(.*?)"">(.*?)</a>")
                                                .WithRegexOption(RegexOptions.ExplicitCapture)
                                                // Part 1: the text between '>' and the next '<'.
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@">(.*?)<")
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                // Part 2: the href attribute value.
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@"href=""(.*?)""")
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                .Build();

                Scraper scraper = new Scraper();

                var scrapedElements = scraper.Scrape(scrapeCriteria);

                if (scrapedElements.Any())
                {
                    int count = 1;
                    foreach (var scrapedElement in scrapedElements)
                    {
                        Console.WriteLine(scrapedElement);

                        // Blank line after every second element to group the output in pairs.
                        if (count % 2 == 0)
                        {
                            Console.WriteLine();
                        }

                        count++;
                    }
                }
                else
                {
                    Console.WriteLine("There were no matches found for the specified scrape Criteria.");
                }
            }
            catch (Exception ex) { Console.WriteLine("There was an error found: {0}", ex.Message); }

            Console.WriteLine();
            Console.WriteLine("The program will close shortly, please acknowledge by pressing any key.");
            Console.ReadKey();
        }
Esempio n. 6
0
        /// <summary>
        /// Entry point of the scraper worker process. Reads a crawl description from one
        /// anonymous pipe, runs the scraper, and writes the serialized result to a second
        /// anonymous pipe. Each direction is framed by a line starting with "SYNC".
        /// </summary>
        /// <param name="args">
        /// Exactly two pipe handle strings: <c>args[0]</c> is opened for reading and
        /// <c>args[1]</c> for writing. Any other argument count is logged as an error.
        /// </param>
        static void Main(string[] args)
        {
            Log.Logger = SetupLogger();

            if (args.Length != 2)
            {
                Log.Error("Expected 2 Arguments (PipeWriteHandle and PipeReadHandle).");
                return;
            }

            using (AnonymousPipeClientStream pipeClientReader =
                       new AnonymousPipeClientStream(PipeDirection.In, args[0]))
            using (PipeStream pipeClientWriter =
                       new AnonymousPipeClientStream(PipeDirection.Out, args[1]))
            {
                try
                {
                    CrawlDescription crawlDescription;

                    // Read the crawl description from the pipe.
                    using (StreamReader sr = new StreamReader(pipeClientReader))
                    {
                        string message;

                        // Skip lines until the SYNC marker arrives.
                        do
                        {
                            message = sr.ReadLine();

                            // ReadLine returns null at end of stream. Fail fast here instead of
                            // looping on null forever; the catch below logs the failure.
                            if (message == null)
                            {
                                throw new EndOfStreamException("Pipe closed before a SYNC message was received.");
                            }

                            Log.Debug("Pipe Received Message: {0}", message);
                        } while (!message.StartsWith("SYNC", StringComparison.Ordinal));

                        // The line after SYNC carries the JSON-serialized crawl description.
                        message = sr.ReadLine();
                        if (message == null)
                        {
                            throw new EndOfStreamException("Pipe closed before the crawl description was received.");
                        }

                        crawlDescription = JsonConvert.DeserializeObject<CrawlDescription>(message);
                        Log.Debug("Pipe Received Crawl Description: {0}", message);
                    }

                    // Process the message. Main is synchronous, so block on the scrape task.
                    CrawlResult crawlResult = null;
                    using (Scraper scraper = new Scraper(crawlDescription))
                    {
                        scraper.Initialize();
                        crawlResult = scraper.Scrape().GetAwaiter().GetResult();
                    }

                    using (StreamWriter sw = new StreamWriter(pipeClientWriter))
                    {
                        sw.AutoFlush = true;

                        // Write the SYNC marker and wait for the pipe to drain.
                        sw.WriteLine("SYNC");
                        pipeClientWriter.WaitForPipeDrain();

                        // Write back the crawl result as a single JSON line.
                        string serializedCrawlResult = JsonConvert.SerializeObject(crawlResult);
                        sw.WriteLine(serializedCrawlResult);
                    }
                }
                catch (Exception ex)
                {
                    Log.Error("WebScraper Exception({0}): {1}", ex.GetType(), ex.Message);
                }
            }
        }