Пример #1
1
        /// <summary>
        /// Scrapes http://www.ishadowsocks.org/ and appends one <see cref="ServerConfig"/> to
        /// <paramref name="serverList"/> for every server entry found under the "#free" element.
        /// </summary>
        /// <param name="serverList">List that receives the parsed server entries.</param>
        private static void GetIshadowsocksServers(List<ServerConfig> serverList)
        {
            var browser = new ScrapingBrowser();

            //set UseDefaultCookiesParser as false if a website returns invalid cookies format
            //browser.UseDefaultCookiesParser = false;
            Console.WriteLine("Open website http://www.ishadowsocks.org/");
            var homePage = browser.NavigateToPage(new Uri("http://www.ishadowsocks.org/"));

            // Diagnostic only: the CSS selector below matches on the id alone, so parsing is
            // still attempted when no "Section" element with id "free" is found.
            var freeSection = homePage.Find("Section", By.Id("free")).FirstOrDefault();
            if (freeSection == null)
            {
                Console.WriteLine("Can't find the Free section.");
            }

            var serverNodes = homePage.Html.CssSelect("#free  >div.container > div.row > div.col-sm-4");

            Console.WriteLine("Read Servers from HTML");

            Console.WriteLine("Parse the server html");
            foreach (var serverNode in serverNodes)
            {
                // The first four <h4> children are read as "label:value" lines in the order
                // address, port, password, method. Splitting into at most two parts keeps
                // values that themselves contain ':' intact.
                var fields = serverNode.ChildNodes
                                       .Where(n => n.Name.Contains("h4"))
                                       .Take(4)
                                       .Select(n => n.InnerText.Split(new[] { ':' }, 2))
                                       .ToList();

                int port;
                if (fields.Count < 4
                    || fields.Any(f => f.Length < 2)
                    || !int.TryParse(fields[1][1], out port))
                {
                    // A malformed entry should not abort the whole scrape.
                    Console.WriteLine("Skip a server entry with missing or malformed fields.");
                    continue;
                }

                serverList.Add(new ServerConfig()
                {
                    server = fields[0][1],
                    server_port = port,
                    password = fields[2][1],
                    method = fields[3][1],
                    remarks = fields[0][1],
                });
            }
        }
Пример #2
0
        /// <summary>
        /// Form load handler: posts the "t=il" request to the adreskodu.dask.gov.tr load.ashx
        /// endpoint, deserializes the response into <c>_iller</c> and fills <c>comboBox1</c>
        /// with the text of each returned entry.
        /// </summary>
        private async void MainForm_Load(object sender, EventArgs e)
        {
            ScrapingBrowser browser = new ScrapingBrowser();

            browser.Encoding = Encoding.UTF8;

            //set UseDefaultCookiesParser as false if a website returns invalid cookies format
            //browser.UseDefaultCookiesParser = false;
            WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"),
                                                                 HttpVerb.Post, "fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=il&u=0");

            Debug.WriteLine(homePage.Html.OuterHtml);
            _iller = JsonConvert.DeserializeObject<ReturnData>(homePage.Html.OuterHtml);
            comboBox1.Items.Clear();

            // The response may be empty or not the expected JSON; do not dereference a null result.
            if (_iller != null && _iller.yt != null)
            {
                foreach (var city in _iller.yt)
                {
                    comboBox1.Items.Add(city.text);
                }
            }

            // Setting SelectedIndex on an empty list throws ArgumentOutOfRangeException.
            if (comboBox1.Items.Count > 0)
            {
                comboBox1.SelectedIndex = 0;
            }
        }
Пример #3
0
        /// <summary>
        /// Selection handler for <c>comboBox7</c>: takes the id of the matching row in
        /// <c>_daireler</c>, posts it as a "t=adr" request to the load.ashx endpoint and shows
        /// the returned text followed by the code in <c>richTextBox1</c>.
        /// </summary>
        private async void comboBox7_SelectedIndexChanged(object sender, EventArgs e)
        {
            var index = comboBox7.SelectedIndex;

            // Index 0 is skipped as in the original; -1 (no selection) must be skipped too,
            // otherwise the row lookup below would use a negative index.
            if (index <= 0)
            {
                return;
            }

            // Combo items are offset by one relative to the rows in _daireler.
            index = index - 1;
            if (_daireler == null || index >= _daireler.Count)
            {
                return;
            }

            var rowId = _daireler[index].GetAttributeValue("id", "");
            if (rowId.Length == 0)
            {
                // No id attribute: Substring(1) would throw on the empty default.
                return;
            }

            // The first character of the id is dropped; the rest is sent as the "u" value.
            var dairekodu = rowId.Substring(1);

            var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=adr&u={dairekodu}";
            ScrapingBrowser browser = new ScrapingBrowser();

            browser.Encoding = Encoding.UTF8;

            WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"),
                                                                 HttpVerb.Post, postData);

            var adres = homePage.Html.InnerText;

            adres += "\r\nAdres Kodu:" + dairekodu;
            richTextBox1.Text = adres;
        }
Пример #4
0
        /// <summary>
        /// Selection handler for <c>comboBox6</c>: posts the id of the matching row in
        /// <c>_binalar</c> as a "t=ick" request to the load.ashx endpoint, stores the returned
        /// table rows in <c>_daireler</c> and lists them in <c>comboBox7</c>.
        /// </summary>
        private async void comboBox6_SelectedIndexChanged(object sender, EventArgs e)
        {
            var index = comboBox6.SelectedIndex;

            // Index 0 is skipped as in the original; -1 (no selection) must be skipped too.
            if (index <= 0)
            {
                return;
            }

            // Combo items are offset by one relative to the rows in _binalar.
            index = index - 1;
            if (_binalar == null || index >= _binalar.Count)
            {
                return;
            }

            var buildingId = _binalar[index].GetAttributeValue("id", "");
            if (buildingId.Length == 0)
            {
                // No id attribute: Substring(1) would throw on the empty default.
                return;
            }
            var binakodu = buildingId.Substring(1);

            var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=ick&u={binakodu}&term=";
            ScrapingBrowser browser = new ScrapingBrowser();

            browser.Encoding = Encoding.UTF8;

            WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"),
                                                                 HttpVerb.Post, postData);

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            _daireler = homePage.Html.SelectNodes("//tbody/tr");
            comboBox7.Items.Clear();
            comboBox7.Items.Add("SEÇİNİZ");

            if (_daireler != null)
            {
                foreach (var daire in _daireler)
                {
                    // Every row must yield exactly one combo item so that item positions stay
                    // aligned with _daireler; short rows get whatever cells are available.
                    var hucreler = daire.SelectNodes("td");
                    var daireIsmi = string.Empty;
                    if (hucreler != null && hucreler.Count > 0)
                    {
                        daireIsmi = hucreler[0].InnerText;
                        if (hucreler.Count > 1)
                        {
                            daireIsmi += "-" + hucreler[1].InnerText;
                        }
                    }
                    daireIsmi = daireIsmi.Replace("&nbsp;", " ");
                    comboBox7.Items.Add(daireIsmi);

                    var rowId = daire.GetAttributeValue("id", "");
                    Debug.WriteLine("Adres Kodu: " + (rowId.Length > 0 ? rowId.Substring(1) : rowId));
                }
            }
            comboBox7.SelectedIndex = 0;
        }
Пример #5
0
        /// <summary>
        /// Selection handler for <c>comboBox4</c>: posts the value of the selected
        /// <c>_mahalleler</c> entry as a "t=sf" request to the load.ashx endpoint, stores the
        /// returned table rows in <c>_caddeler</c>, lists them in <c>comboBox5</c>, remembers the
        /// selection in <c>_mahalleKodu</c>/<c>_mahalle</c> and enables the start button.
        /// </summary>
        private async void comboBox4_SelectedIndexChanged(object sender, EventArgs e)
        {
            var index = comboBox4.SelectedIndex;

            // Index 0 is skipped as in the original; -1 (no selection) must be skipped too,
            // otherwise the lookup below would use a negative index.
            if (index <= 0)
            {
                return;
            }

            var mahallekodu = _mahalleler.yt[index].value;
            var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=sf&u={mahallekodu}&term=";
            ScrapingBrowser browser = new ScrapingBrowser();

            browser.Encoding = Encoding.UTF8;

            WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"),
                                                                 HttpVerb.Post, postData);

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            _caddeler = homePage.Html.SelectNodes("//tbody/tr");
            comboBox5.Items.Clear();
            comboBox5.Items.Add("SEÇİNİZ");

            if (_caddeler != null)
            {
                foreach (var cadde in _caddeler)
                {
                    // One combo item per row, even for short rows, so that item positions
                    // keep matching the rows stored in _caddeler.
                    var hucreler = cadde.SelectNodes("td");
                    var caddeIsmi = string.Empty;
                    if (hucreler != null && hucreler.Count > 0)
                    {
                        caddeIsmi = hucreler[0].InnerText;
                        if (hucreler.Count > 1)
                        {
                            caddeIsmi += "-" + hucreler[1].InnerText;
                        }
                    }
                    caddeIsmi = caddeIsmi.Replace("&nbsp;", " ");
                    comboBox5.Items.Add(caddeIsmi);
                }
            }
            comboBox5.SelectedIndex = 0;
            _mahalleKodu = mahallekodu;
            _mahalle = _mahalleler.yt[index].text;
            buttonStart.Enabled = true;
            buttonStop.Enabled = false;
        }
Пример #6
0
        /// <summary>
        /// Selection handler for <c>comboBox3</c>: posts the value of the selected
        /// <c>_koyler</c> entry as a "t=mh" request to the load.ashx endpoint, deserializes the
        /// response into <c>_mahalleler</c>, lists its entries in <c>comboBox4</c>, remembers the
        /// selection in <c>_koyKodu</c>/<c>_koy</c> and disables both buttons.
        /// </summary>
        private async void comboBox3_SelectedIndexChanged(object sender, EventArgs e)
        {
            var index = comboBox3.SelectedIndex;

            // Index 0 is skipped as in the original; -1 (no selection) must be skipped too,
            // otherwise the lookup below would use a negative index.
            if (index <= 0)
            {
                return;
            }

            var koykodu = _koyler.yt[index].value;
            var postData = "fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=mh&u=";
            ScrapingBrowser browser = new ScrapingBrowser();

            browser.Encoding = Encoding.UTF8;

            WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"),
                                                                 HttpVerb.Post, postData + koykodu);

            _mahalleler = JsonConvert.DeserializeObject<ReturnData>(homePage.Html.OuterHtml);
            comboBox4.Items.Clear();

            // The response may be empty or not the expected JSON; do not dereference a null result.
            if (_mahalleler != null && _mahalleler.yt != null)
            {
                foreach (var mahalle in _mahalleler.yt)
                {
                    comboBox4.Items.Add(mahalle.text);
                }
            }

            // Setting SelectedIndex on an empty list throws ArgumentOutOfRangeException.
            if (comboBox4.Items.Count > 0)
            {
                comboBox4.SelectedIndex = 0;
            }
            _koyKodu = koykodu;
            _koy = _koyler.yt[index].text;
            buttonStart.Enabled = false;
            buttonStop.Enabled = false;
        }
Пример #7
0
 /// <summary>
 /// Creates the service with a new <see cref="ScrapingBrowser"/> and an ignore list
 /// pre-filled with three fixed phrases.
 /// </summary>
 public GoogleMatchService()
 {
     browser = new ScrapingBrowser();

     var phrases = new List<string>();
     phrases.Add("how are you");
     phrases.Add("what is that");
     phrases.Add("how is it going");
     ignoreList = phrases;
 }
Пример #8
0
        /// <summary>
        /// Downloads the page at <c>NodeUrlHelper.NcdcReportUrl</c> with a new
        /// <see cref="ScrapingBrowser"/>.
        /// </summary>
        /// <returns>The <c>Html</c> node of the downloaded page.</returns>
        public HtmlNode GetNcdcHtmlData()
        {
            ScrapingBrowser reportBrowser = new ScrapingBrowser();
            Uri reportUri = new Uri(NodeUrlHelper.NcdcReportUrl);

            return reportBrowser.NavigateToPage(reportUri).Html;
        }
Пример #9
0
 /// <summary>
 /// Creates the parser with an empty news list and a <see cref="ScrapingBrowser"/> that
 /// has <c>IgnoreCookies</c> set to false and uses UTF-8 encoding.
 /// </summary>
 public IgromaniaNewsParser()
 {
     web = new ScrapingBrowser
     {
         IgnoreCookies = false,
         Encoding = System.Text.Encoding.UTF8
     };
     newsItems = new List<NewsItemInfo>();
 }
Пример #10
0
        /// <summary>
        /// Downloads <paramref name="url"/> with a new <see cref="ScrapingBrowser"/>.
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <returns>The <c>Html</c> node of the downloaded page.</returns>
        public static HtmlNode GetHtml(string url)
        {
            var scraper = new ScrapingBrowser();
            var address = new Uri(url);
            WebPage downloaded = scraper.NavigateToPage(address);

            return downloaded.Html;
        }
Пример #11
0
        /// <summary>
        /// Fetches the page at <paramref name="url"/> using a throw-away
        /// <see cref="ScrapingBrowser"/>.
        /// </summary>
        /// <param name="url">Absolute address of the page to fetch.</param>
        /// <returns>The <c>Html</c> node of the fetched page.</returns>
        private static HtmlNode GetHtml(string url)
        {
            var scraper = new ScrapingBrowser();
            var pageUri = new Uri(url);

            return scraper.NavigateToPage(pageUri).Html;
        }
Пример #12
0
        /// <summary>
        /// Reads a rank for <paramref name="ticker"/> from its gurufocus.com summary page:
        /// inside the div with id "ratios" it takes the first table cell whose text contains
        /// a "digit/digit" pattern and parses the part before the first '/'.
        /// </summary>
        /// <param name="ticker">Ticker symbol inserted into the summary page URL.</param>
        /// <returns>
        /// The parsed number, or 0 when the page cannot be loaded, the div or cell is not
        /// found, or the text before the '/' is not an integer.
        /// </returns>
        public int GetValuationRank(string ticker)
        {
            ScrapingBrowser scraper = new ScrapingBrowser();
            WebPage summaryPage;

            try
            {
                summaryPage = scraper.NavigateToPage(new Uri($"https://www.gurufocus.com/stock/{ticker}/summary"));
            }
            catch
            {
                // Best effort: any failure while loading the page is reported as rank 0.
                return(0);
            }

            var ratiosDiv = summaryPage.Html
                                       .CssSelect("div")
                                       .FirstOrDefault(div => div.Attributes["id"]?.Value == "ratios");
            if (ratiosDiv == null)
            {
                return(0);
            }

            Regex rankPattern = new Regex(@"\d/\d");
            var rankCell = ratiosDiv.CssSelect("td")
                                    .FirstOrDefault(cell => rankPattern.IsMatch(cell.InnerText));
            if (rankCell == null)
            {
                return(0);
            }

            // TryParse leaves 0 in the out variable when the text is not an integer.
            int rank;
            int.TryParse(rankCell.InnerText.Split('/').FirstOrDefault(), out rank);
            return(rank);
        }
        /// <summary>
        /// Downloads the ordered creature list from tibia.fandom.com and extracts a name from
        /// the "/wiki/..." link found in each child node of every ordered list on the page.
        /// </summary>
        /// <returns>The extracted names with "%27" and "%C3%B1" decoded.</returns>
        private string[] GetWikiMonsters()
        {
            const string listAddress = "https://tibia.fandom.com/wiki/List_of_Creatures_(Ordered)";
            var collected = new List<string>();

            ScrapingBrowser scraper = new ScrapingBrowser();
            scraper.Encoding = Encoding.UTF8;

            WebPage listPage = scraper.NavigateToPage(new Uri(listAddress));
            var listElements = listPage.Html.CssSelect("ol");

            // Link targets are percent-encoded:
            // %27 stands for the ' character and %C3%B1 for the ñ character.
            var linkPattern = new Regex("/wiki/(?<name>[[a-zA-Z.()_%27%C3%B1-]+)");

            foreach (var listElement in listElements)
            {
                foreach (var entry in listElement.ChildNodes)
                {
                    var entryMarkup = entry.InnerHtml;
                    if (!linkPattern.IsMatch(entryMarkup))
                    {
                        continue;
                    }

                    var found = linkPattern.Matches(entryMarkup);
                    var rawName = found.FindNamedGroupValue("name");
                    collected.Add(rawName.Replace("%27", "'").Replace("%C3%B1", "ñ"));
                }
            }

            return(collected.ToArray());
        }
Пример #14
0
        /// <summary>
        /// Downloads the page at <c>SCRAPING_PATH</c> and returns the text of the last anchor
        /// matched by the "p&gt;a" selector.
        /// </summary>
        /// <returns>The inner text of the last matching anchor.</returns>
        private async Task<string> ScrapePage()
        {
            var scraper = new ScrapingBrowser();
            var target = new Uri(SCRAPING_PATH);
            var downloaded = await scraper.NavigateToPageAsync(target);

            var anchors = downloaded.Html.CssSelect("p>a");
            var lastAnchor = anchors.Last();
            return(lastAnchor.InnerText);
        }
Пример #15
0
        /// <summary>
        /// Downloads the JSON document at <paramref name="url"/>, collects the "AnswerLink"
        /// value of every element and, when at least one link was found, stores the links
        /// (serialized as JSON) in the "Answers" column of <paramref name="r"/> and passes
        /// them to <c>getuser</c>.
        /// </summary>
        /// <param name="name">Forwarded unchanged to <c>getuser</c>.</param>
        /// <param name="url">Address of the JSON document to download.</param>
        /// <param name="dt_sa">Forwarded unchanged to <c>getuser</c>.</param>
        /// <param name="r">Row whose "Answers" column receives the serialized links.</param>
        void getanslink(string name, string url, DataTable dt_sa, DataRow r)
        {
            var browser1 = new ScrapingBrowser();
            browser1.UserAgent = FakeUserAgents.Chrome;

            // The payload is JSON, so it is handed straight to the JSON mapper; the previous
            // extra HTML parse of the same text produced values that were never used.
            var payload = browser1.DownloadString(new Uri(url));
            JsonData data = JsonMapper.ToObject(payload);

            List<String> qa = new List<String>();
            for (int i = 0; i < data.Count; i++)
            {
                qa.Add(data[i]["AnswerLink"].ToString());
            }

            if (qa.Count != 0)
            {
                r["Answers"] = JsonConvert.SerializeObject(qa);
                getuser(name, qa, dt_sa, r);
            }
        }
        /// <summary>
        /// Looks up <paramref name="nickname"/> on divisiontracker.com and replies with an
        /// embed showing the "Playtime" and "Rogue Players Killed" stats from the profile page.
        /// </summary>
        /// <param name="ctx">Command context used to trigger typing and send the reply.</param>
        /// <param name="nickname">Uplay nickname; URL-escaped before it is put in the address.</param>
        public async Task Short(CommandContext ctx, [RemainingText] string nickname)
        {
            await ctx.TriggerTypingAsync();

            ScrapingBrowser browser = new ScrapingBrowser();

            //set UseDefaultCookiesParser as false if a website returns invalid cookies format
            //browser.UseDefaultCookiesParser = false;

            // Escape the nickname so characters such as '#', '?' or '/' cannot alter the URL,
            // and await the download instead of blocking inside an async method.
            var profileUri = new Uri("http://divisiontracker.com/profile/uplay/" + Uri.EscapeDataString(nickname ?? string.Empty));
            WebPage homePage = await browser.NavigateToPageAsync(profileUri);

            List<HtmlNode> resultNames = homePage.Html.CssSelect("div.stats-stat>div.name").ToList();

            List<HtmlNode> resultValues = homePage.Html.CssSelect("div.stats-stat>div.value").ToList();

            Dictionary<string, string> namesAndValues = new Dictionary<string, string>();

            // Pair names with values position by position; stop at the shorter list and keep
            // the first occurrence of a repeated name instead of throwing.
            int pairCount = Math.Min(resultNames.Count, resultValues.Count);
            for (int i = 0; i < pairCount; i++)
            {
                string statName = resultNames[i].InnerText;
                if (!namesAndValues.ContainsKey(statName))
                {
                    namesAndValues.Add(statName, resultValues[i].InnerText);
                }
            }

            // The keys are the raw inner text of the name elements, including line breaks.
            // A profile that lacks a stat gets a placeholder instead of a KeyNotFoundException.
            string playtime;
            if (!namesAndValues.TryGetValue("\nPlaytime \n", out playtime))
            {
                playtime = "n/a\n";
            }

            string rogueKills;
            if (!namesAndValues.TryGetValue("\nRogue Players Killed \n", out rogueKills))
            {
                rogueKills = "n/a\n";
            }

            DiscordEmbedBuilder builder = new DiscordEmbedBuilder();

            builder.WithTitle($"{ctx.Client.CurrentUser.Username}\n{ctx.Command.Name}\n{nickname}")
                   .WithDescription($"PLAYTIME: {playtime}ROGUE PLAYERS KILLED: \n{rogueKills}")
                   .WithColor(DiscordColor.Orange);

            await ctx.RespondAsync("", false, builder.Build());
        }
Пример #17
0
        /// <summary>
        /// Entry point: downloads the index page of one hard-coded manga, asks the seeker for
        /// its indexes, starts one <c>DownloadIndexAsync</c> call per index without awaiting
        /// it, and then waits for a line on the console.
        /// </summary>
        static void Main(string[] args)
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            Console.WriteLine("Hello World!");

            var manga = new Manga
            {
                Name     = "擅长捉弄的高木同学",
                Charset  = "GBK",
                BaseUrl  = "http://comic.ikkdm.com",
                IndexUrl = "http://comic.ikkdm.com/comiclist/2112/index.htm"
            };

            var scraper = new ScrapingBrowser();
            // The page is decoded with the charset declared on the manga entry.
            scraper.Encoding = Encoding.GetEncoding(manga.Charset);
            //set UseDefaultCookiesParser as false if a website returns invalid cookies format
            //scraper.UseDefaultCookiesParser = false;
            var indexPage = scraper.NavigateToPage(new Uri(manga.IndexUrl));

            IMangaSeeker seeker = new Kuku(manga);
            var indexes = seeker.GetIndexes(indexPage.Html);

            // Downloads are started without being awaited; the position counter is passed along.
            int position = 0;
            foreach (var index in indexes)
            {
                _ = DownloadIndexAsync(index, position, manga, seeker);
                position++;
            }

            // Blocks until a line is entered on the console.
            Console.ReadLine();
        }
Пример #18
0
        /// <summary>
        /// Loads the racingpost.com time-ordered results page for <paramref name="date"/> and
        /// saves a <c>ScrapeRace</c> row for every listed race whose id is not already in
        /// <paramref name="raceIds"/>. Courses missing from <c>AllCourses</c> are created first.
        /// </summary>
        /// <param name="date">Day whose results page is loaded.</param>
        /// <param name="raceIds">Ids of races that are already known and must be skipped.</param>
        /// <param name="db">Context that receives the new course and race rows.</param>
        public static void DownloadRaceList(DateTime date, List<int?> raceIds, RacingPostRacesEntities db)
        {
            // Format with the invariant culture: under a culture with a non-Gregorian default
            // calendar the current-culture format would put a different year into the URL.
            var url = string.Format(@"https://www.racingpost.com/results/{0}/time-order",
                                    date.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture));

            var web = new HtmlWeb();
            var doc = web.Load(url);

            var nodes = doc.QuerySelectorAll("div .rp-timeView__raceInfo").ToList();

            foreach (var item in nodes)
            {
                var courseUrl = baseUrl + item.ChildNodes[1].ChildNodes[1].Attributes["href"].Value;
                var courseId  = Helper.GetIdfromUrl(courseUrl, "https://www.racingpost.com/profile/course/");

                // Entries whose race node carries no href are skipped.
                var raceLink = item.ChildNodes[3].ChildNodes[1];
                if (!raceLink.Attributes.Any(a => a.Name == "href"))
                {
                    continue;
                }
                string raceUrl = baseUrl + raceLink.Attributes["href"].Value;

                // The race id is the last URL segment; skip the entry instead of throwing
                // when that segment is not a number.
                int parsedRaceId;
                if (!int.TryParse(raceUrl.Split('/').LastOrDefault(), out parsedRaceId))
                {
                    continue;
                }
                int? raceId = parsedRaceId;

                if (raceIds.Any(r => r == raceId))
                {
                    continue;
                }

                //save url to be scraped
                ScrapeRace scrapeRace = new ScrapeRace();
                scrapeRace.Link      = raceUrl;
                scrapeRace.RaceId    = raceId;
                scrapeRace.RaceDate  = date;
                scrapeRace.Scraped   = false;
                scrapeRace.Required  = true;
                scrapeRace.CourseUrl = courseUrl;

                var course = AllCourses.Where(c => c.Id == courseId).FirstOrDefault();
                if (course == null)
                {
                    // Unknown course: create it, named after the last URL segment in upper case.
                    course = new RPCourse {
                        Id = courseId, Name = courseUrl.Split('/').LastOrDefault().ToUpper()
                    };
                    db.RPCourses.Add(course);
                    db.SaveChanges();
                    AllCourses.Add(course);
                }

                // A course without a country is treated as "GB".
                string country = course.Country;
                scrapeRace.Country = string.IsNullOrEmpty(country) ? "GB" : country;
                db.ScrapeRaces.Add(scrapeRace);
                db.SaveChanges();
            }
        }
Пример #19
0
        /// <summary>
        /// Demo entry point: loads http://localhost:51621/, reads the ".navbar-brand" text and
        /// the first cell of every row of "#PersonTable", then fills and submits the form
        /// with id "dataForm" using POST.
        /// </summary>
        static void Main(string[] args)
        {
            // Create the browser with auto and meta redirects enabled.
            var scraper = new ScrapingBrowser();
            scraper.AllowAutoRedirect = true;
            scraper.AllowMetaRedirect = true;

            // Load the start page.
            WebPage landingPage = scraper.NavigateToPage(new Uri("http://localhost:51621/"));

            // First value: the text of the ".navbar-brand" element.
            HtmlNode brandNode = landingPage.Html.CssSelect(".navbar-brand").First();
            string brandText   = brandNode.InnerText;

            // Second value: the first cell of every body row of the person table.
            var firstColumn = new List<string>();
            HtmlNode personTable = landingPage.Html.CssSelect("#PersonTable").First();

            foreach (var tableRow in personTable.SelectNodes("tbody/tr"))
            {
                foreach (var firstCell in tableRow.SelectNodes("td[1]"))
                {
                    firstColumn.Add(firstCell.InnerText);
                }
            }

            // Fill in the form fields and post the form back.
            PageWebForm dataForm = landingPage.FindFormById("dataForm");
            dataForm["UserName"] = "******";
            dataForm["Gender"]   = "M";
            dataForm.Method      = HttpVerb.Post;
            WebPage submitted = dataForm.Submit();
        }
Пример #20
0
        /// <summary>
        /// Scrapes the threads with ids from <paramref name="newestId"/> - <paramref name="range"/>
        /// (inclusive) up to <paramref name="newestId"/> (exclusive). The id range is split into
        /// one contiguous slice per processor; each slice runs in its own task, and every thread
        /// that has both a title and posts is stored as an <c>Advert</c>.
        /// </summary>
        /// <param name="newestId">Exclusive upper bound of the thread ids to scrape.</param>
        /// <param name="range">How many ids below <paramref name="newestId"/> are scraped.</param>
        /// <param name="baseUrl">Thread page address; the id is appended as "?t={id}".</param>
        static void Scrape(int newestId, int range, string baseUrl = "https://vozforums.com/showthread.php")
        {
            var degreeOfParallelism = Environment.ProcessorCount;
            var tasks    = new Task[degreeOfParallelism];
            int throttle = newestId - range;

            for (int taskNumber = 0; taskNumber < degreeOfParallelism; taskNumber++)
            {
                // A for-loop variable is shared by all iterations, so each lambda captures
                // its own copy instead of the loop variable itself.
                int taskNumberCopy = taskNumber;

                tasks[taskNumber] = Task.Factory.StartNew(
                    () =>
                {
                    ScrapingBrowser Browser   = new ScrapingBrowser();
                    Browser.AllowAutoRedirect = true;     // Browser has many settings you can access in setup
                    Browser.AllowMetaRedirect = true;
                    WebPage PageResult;

                    // This task's slice of the id range: [min, max).
                    var max = throttle + (newestId - throttle) * (taskNumberCopy + 1) / degreeOfParallelism;
                    var min = throttle + (newestId - throttle) * (taskNumberCopy) / degreeOfParallelism;
                    logger.InfoFormat("max-min:{0}-{1}", max, min);
                    for (int i = min; i < max; i++)
                    {
                        var url    = $"{baseUrl}?t={i}";
                        PageResult = Browser.NavigateToPage(new Uri(url));
                        RemoveComment(PageResult.Html);
                        var navbars = PageResult.Html.CssSelect(".navbar");

                        // The title is taken from the last ".navbar" element, when there is one.
                        string title = "";
                        if (navbars != null && navbars.Any())
                        {
                            title = navbars.Last().InnerText.Trim();
                            logger.Warn(title);
                        }

                        var rawPosts = PageResult.Html.CssSelect(".voz-post-message");

                        List<Post> Posts = new List<Post>();
                        foreach (var rawPost in rawPosts)
                        {
                            Posts.Add(new Post(rawPost.InnerHtml));
                        }

                        if (!string.IsNullOrEmpty(title) && Posts.Any())
                        {
                            repository.Create<Advert>(new Advert()
                            {
                                Title = title,
                                // Store the address that was actually scraped; previously every
                                // advert was saved with the same hard-coded thread URL.
                                URL   = url,
                                Posts = Posts
                            });
                        }
                    }
                });
            }

            Task.WaitAll(tasks);
        }
Пример #21
0
        /// <summary>
        /// Scrapes the realmeye.com page of a guild and copies its name, the three description
        /// lines, member count, character count, fame and "Most active on" value into
        /// <paramref name="guild"/>.
        /// </summary>
        /// <param name="guildName">Guild name; spaces are replaced with "%20" for the URL.</param>
        /// <param name="guild">Object that receives the scraped values.</param>
        /// <returns>
        /// <c>true</c> when scraping finished without an exception; otherwise <c>false</c>, in
        /// which case <c>guild.Name</c> is set to "Private".
        /// </returns>
        public static bool GetGuildSummary(string guildName, Guild guild)
        {
            guildName = guildName.Replace(" ", "%20");
            // NOTE(review): "result" is not declared in this method — presumably a static
            // field shared between calls; confirm before calling this concurrently.
            result = false;

            ScrapingBrowser scraper = new ScrapingBrowser();
            scraper.AllowAutoRedirect = true;
            scraper.AllowMetaRedirect = true;

            try
            {
                WebPage guildPage = scraper.NavigateToPage(new Uri("https://www.realmeye.com/guild/" + guildName));
                HtmlNode nameNode = guildPage.Html.CssSelect(".entity-name").First();
                guild.Name = nameNode.InnerText;

                try
                {
                    // The first three child nodes of "#d" are read as the description lines.
                    var descriptionBlock = guildPage.Html.CssSelect("#d").First();
                    var firstLine = descriptionBlock.FirstChild;
                    guild.Desc1 = firstLine.InnerText;
                    guild.Desc2 = firstLine.NextSibling.InnerText;
                    guild.Desc3 = firstLine.NextSibling.NextSibling.InnerText;
                }
                catch
                {
                    // Any failure while reading the description marks all three lines as private.
                    guild.Desc1 = "Private";
                    guild.Desc2 = "Private";
                    guild.Desc3 = "Private";
                }

                var summaryTable = guildPage.Html.CssSelect(".summary").First();

                foreach (var summaryRow in summaryTable.SelectNodes("tr"))
                {
                    foreach (var labelCell in summaryRow.SelectNodes("td[1]"))
                    {
                        // The label is in the first cell; the value is read from its next sibling.
                        switch (labelCell.InnerText)
                        {
                            case "Members":
                                guild.MemberCount = labelCell.NextSibling.InnerText;
                                break;

                            case "Characters":
                                guild.Chars = labelCell.NextSibling.InnerText;
                                break;

                            case "Fame":
                                guild.Fame = labelCell.NextSibling.InnerText;
                                break;

                            case "Most active on":
                                guild.MostActiveOn = labelCell.NextSibling.InnerText;
                                break;
                        }
                    }
                }
                result = true;
            }
            catch (Exception)
            {
                guild.Name = "Private";
            }
            return(result);
        }
Пример #22
0
        /// <summary>
        /// Entry point: collects the member links from twelve pages of the Chester Devs member
        /// list on meetup.com, then visits each profile and stores the person together with
        /// the location, interests and meetups found on it, relating each of them to the person.
        /// </summary>
        static void Main(string[] args)
        {
            Neo             db          = new Neo();
            ScrapingBrowser browser     = new ScrapingBrowser();
            List<HtmlNode>  memberLinks = new List<HtmlNode>();

            string startPoint = "http://www.meetup.com/Chester-Devs/members/?offset={0}&desc=1&sort=social_sort";

            // Twelve list pages are read; the offset advances by 20 per page.
            for (int pages = 1; pages <= 12; pages++)
            {
                WebPage memberList = browser.NavigateToPage(new Uri(String.Format(startPoint, ((pages - 1) * 20).ToString())));
                memberLinks.AddRange(memberList.Html.CssSelect("a.memName"));
            }

            List<Person> people = memberLinks.Select(ml => new Person()
            {
                Name = ml.InnerText, LinkToProfile = ml.Attributes["href"].Value
            }).ToList();

            foreach (Person person in people)
            {
                WebPage memberPage = browser.NavigateToPage(new Uri(person.LinkToProfile));
                db.Create<Person>(person);

                // A profile without a "span.locality" element gets no location; previously
                // First() threw here and aborted the whole run.
                string locality = memberPage.Html.CssSelect("span.locality").Select(l => l.InnerText).FirstOrDefault();
                if (locality != null)
                {
                    Location location = new Location()
                    {
                        Name = locality
                    };
                    db.Create<Location>(location);
                    db.RelatePersonToLocation(person.UniqueId, location.Name);
                }

                List<Interest> interests = memberPage.Html
                                                     .CssSelect("ul#memberTopicList > li.D_group > div > a.topic-widget")
                                                     .Select(i => new Interest()
                {
                    Name = i.InnerText
                }).ToList();
                interests.ForEach(i => db.Create<Interest>(i));
                interests.ForEach(i => db.RelatePersonToInterest(person.UniqueId, i.Name));

                List<Meetup> meetups = memberPage.Html.CssSelect("div.D_name").Select(g => new Meetup()
                {
                    Name = g.InnerText
                }).ToList();

                // "Chester Devs" is added when the profile does not already list it.
                if (!meetups.Any(g => g.Name == "Chester Devs"))
                {
                    meetups.Add(new Meetup()
                    {
                        Name = "Chester Devs"
                    });
                }

                meetups.ForEach(g => db.Create<Meetup>(g));
                meetups.ForEach(g => db.RelatePersonToMeetup(person.UniqueId, g.Name));
            }
        }
Пример #23
0
        /// <summary>
        /// Runs the scraping steps of every field in <paramref name="sourceTemplate"/> against
        /// <paramref name="scraperPointer"/>. When a field's last step is
        /// <c>TravelToChildTemplate</c>, the target URI is read from the latest record and
        /// scraping continues on that page with the referenced child template.
        /// </summary>
        /// <param name="scrapingBrowser">Browser whose <c>Referer</c> supplies the root URI.</param>
        /// <param name="scrapingEvent">Event holding the records that are filled in.</param>
        /// <param name="scraperPointer">Node the field instructions are applied to.</param>
        /// <param name="sourceTemplate">Template whose fields are processed.</param>
        /// <param name="allTemplates">All templates; used to look up child templates by id.</param>
        private static void SinglePassScrapeTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, HtmlNode scraperPointer, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection<TemplateInstructions> allTemplates)
        {
            // Root URI: everything before the first '/' found at or after index 8 of the
            // referer's absolute URI.
            var absoluteUri = scrapingBrowser.Referer.AbsoluteUri;
            var rootUri     = absoluteUri.Substring(0, absoluteUri.IndexOf('/', 8));

            foreach (var templateField in sourceTemplate.TemplateFields)
            {
                if (!templateField.OrderedScrapingSteps.Any())
                {
                    continue;
                }

                try
                {
                    PerformScrapingForTemplateField(scraperPointer, templateField, scrapingEvent, rootUri);
                }
                catch (Exception)
                {
                    // Any failure for a field is reported and the navigation check below still runs.
                    Console.WriteLine($"\t\"{templateField.Name}\" field missing from entry.");
                }

                // Perform navigation action if the last step of a template's scraping instructions dictates it.
                var lastStep = templateField.OrderedScrapingSteps.Last();
                if (lastStep.ActionType != ScrapingActionType.TravelToChildTemplate)
                {
                    continue;
                }

                var jsonSerializer = new JavaScriptSerializer();
                var travelToChildTemplateParams = jsonSerializer
                                                  .Deserialize<TravelToChildTemplateParams>(lastStep.Parameters);

                // Need child template instructions.
                var childTemplateId = travelToChildTemplateParams.Child;
                var childTemplate   = allTemplates
                                      .First(t => t.TemplateId == childTemplateId && !t.IsTopLevel);

                // A non-numeric source leaves targetFieldId at 0, so the lookup key becomes "0".
                int targetFieldId;
                int.TryParse(travelToChildTemplateParams.Source, out targetFieldId);
                var fieldKey = targetFieldId.ToString();

                var    currentRecord = scrapingEvent.Records.Last();
                string targetUri;

                if (travelToChildTemplateParams.IsFromTemporaryField)
                {
                    // The target URI was saved as a temporary field.
                    targetUri = currentRecord.TemporaryFieldIdToValueDictionary[fieldKey];
                }
                else
                {
                    // The target URI is a standard, non-temporary field.
                    targetUri = currentRecord.TargetFieldIdToValueDictionary[fieldKey];
                }

                // Set record lock so that the same record is used.
                scrapingEvent.LockOnCurrentRecord = true;

                ScrapeFromTemplate(scrapingBrowser, scrapingEvent, rootUri + targetUri, childTemplate, allTemplates);
            }
        }
Пример #24
0
        /// <summary>
        /// Scrapes every "div.post" element on the page at <paramref name="url"/> into an
        /// article: title, short text, date, link and the text of the linked detail page.
        /// The work runs on a thread-pool task.
        /// </summary>
        /// <param name="url">Address of the listing page to scrape.</param>
        /// <returns>A task producing the list of articles created from the page.</returns>
        public override Task<List<Article>> GetArticles(string url)
        {
            _log.Info("DvCvitMediteranaScraper scraping " + url);

            return(Task.Run(() =>
            {
                var articles = new List<Article>();

                var browser = new ScrapingBrowser
                {
                    AllowAutoRedirect = true,
                    AllowMetaRedirect = true,
                    Encoding = Encoding.UTF8
                };
                var homePage = browser.NavigateToPage(new Uri(url));

                var pageArticles = homePage.Html.CssSelect("div.post");
                foreach (var pageArticle in pageArticles)
                {
                    var titleLink = pageArticle.CssSelect("h2 > a").Single();
                    var title = titleLink.InnerText.Replace("&nbsp;", " ").Replace("&quot;", "\"");

                    var shortText = pageArticle.CssSelect("div.post-text").Single().InnerText;

                    // The first paragraph is used as the date when it starts with a digit and
                    // parses as one; otherwise the current time is kept. The emptiness check
                    // now comes before the first character is read, and an unparsable value no
                    // longer throws.
                    var time = pageArticle.CssSelect("div.post-text > p").First().InnerText;
                    var date = DateTime.Now;
                    var trimmedTime = (time ?? string.Empty).Trim();
                    DateTime parsedDate;
                    if (trimmedTime.Length > 0
                        && char.IsDigit(trimmedTime[0])
                        && DateTime.TryParse(trimmedTime, out parsedDate))
                    {
                        date = parsedDate;
                    }

                    var link = titleLink.Attributes["href"].Value;

                    // Best effort: a failing detail page is logged and the link is used as text.
                    var text = "";
                    try
                    {
                        var homedetailsPage = browser.NavigateToPage(new Uri(link));
                        var psDetails = homedetailsPage.Html.CssSelect("div.post");
                        text = psDetails.Aggregate(text,
                                                   (current, psDetail) => current + (psDetail.InnerText + Environment.NewLine)).Replace("&nbsp;", " ");
                    }
                    catch (Exception ex)
                    {
                        _log.Error("DvCvitMediteranaScraper error: " + ex.Message + Environment.NewLine + ex.StackTrace);
                    }

                    if (string.IsNullOrWhiteSpace(text))
                    {
                        text = link;
                    }

                    _log.Info("DvCvitMediteranaScraper Creating " + title);

                    CreateArticle(text, articles, title, shortText, link, ArticleType.DvCvitMediterana, date);
                }

                return articles;
            }));
        }
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the inner text of every
        /// h1, h2 and h3 element: all h1 entries first, then all h2, then all h3.
        /// </summary>
        /// <param name="url">Absolute URL of the page to inspect.</param>
        /// <returns>One <c>HeadingDto</c> per heading found; empty when the page has none.</returns>
        public List <HeadingDto> GetTextofHeadingTags(string url)
        {
            var collected = new List <HeadingDto>();

            var scraper = new ScrapingBrowser();
            scraper.AllowAutoRedirect = true;
            scraper.AllowMetaRedirect = true;

            WebPage fetchedPage = scraper.NavigateToPage(new Uri(url));

            // Re-parse the downloaded markup into a standalone document for the XPath queries.
            var parsed = new HtmlDocument();
            parsed.LoadHtml(fetchedPage.Html.InnerHtml);

            foreach (string level in new[] { "h1", "h2", "h3" })
            {
                // SelectNodes yields null rather than an empty collection when nothing matches.
                var matches = parsed.DocumentNode.SelectNodes("//" + level);
                if (matches == null)
                {
                    continue;
                }

                foreach (HtmlNode heading in matches)
                {
                    collected.Add(new HeadingDto()
                    {
                        Content     = heading.InnerText,
                        HeadingType = level
                    });
                }
            }

            return collected;
        }
Пример #26
0
 /// <summary>
 /// Sets up the parser: registers the code-pages encoding provider, starts with an empty
 /// news item list, and creates a browser that keeps cookies and decodes with code page 1251.
 /// </summary>
 public KinoNewsParser()
 {
     var codePages = System.Text.CodePagesEncodingProvider.Instance;
     System.Text.Encoding.RegisterProvider(codePages);

     newsItems = new List <NewsItemInfo>();
     web       = new ScrapingBrowser
     {
         IgnoreCookies = false,
         Encoding      = codePages.GetEncoding(1251)
     };
 }
Пример #27
0
        /// <summary>
        /// Downloads the document at <c>initurl</c> into <c>result</c> and then runs <c>doHtml</c>.
        /// </summary>
        public void ScrapingCrawler()
        {
            var startAddress = new Uri(initurl);
            var downloader   = new ScrapingBrowser();

            result = downloader.DownloadString(startAddress);

            doHtml();
        }
        /// <summary>
        /// Loads the page at <c>url</c> with a new browser (stored in <c>browser</c>) and builds
        /// the products from the nodes matching the product-wrapper selector.
        /// </summary>
        /// <returns>Whatever <c>Producto.assemble</c> produces for the matched nodes.</returns>
        public override Producto[] getProductos()
        {
            browser = new ScrapingBrowser();

            HtmlNode[] productNodes = browser
                .NavigateToPage(new Uri(url))
                .Html
                .CssSelect("li.products__item div.products__wrap.clearfix")
                .ToArray();

            return Producto.assemble(productNodes, getNombre, getPrecio);
        }
Пример #29
0
        /// <summary>
        /// Navigates to <paramref name="url"/> with the given verb and data, passes the browser and
        /// the returned HTML to <paramref name="action"/>, and then returns that HTML.
        /// </summary>
        /// <param name="scrapingBrowser">Browser used for the request.</param>
        /// <param name="url">Target address; must be a valid URI string.</param>
        /// <param name="data">Values sent along with the request.</param>
        /// <param name="action">Callback invoked with the browser and the HTML.</param>
        /// <param name="httpVerb">HTTP verb to use; defaults to GET.</param>
        /// <returns>The HTML returned by the navigation.</returns>
        public static string NavigateTo(this ScrapingBrowser scrapingBrowser, string url, NameValueCollection data,
                                        Action <ScrapingBrowser, string> action, HttpVerb httpVerb = HttpVerb.Get)
        {
            var target   = new Uri(url);
            var response = scrapingBrowser.NavigateTo(target, httpVerb, data);

            action(scrapingBrowser, response);

            return response;
        }
Пример #30
0
 /// <summary>
 /// Returns the shared browser held in <c>_browser</c>, creating it on first use.
 /// No locking is performed here.
 /// </summary>
 private static ScrapingBrowser BrowserInstance()
 {
     return _browser ?? (_browser = new ScrapingBrowser());
 }
Пример #31
0
        /// <summary>
        /// Walks every result page of a fixed autotrader.co.uk search and appends one CSV row per
        /// listing to <c>.\cars.csv</c>.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            // Everything except the page number is constant across requests.
            const string searchUrl = "https://www.autotrader.co.uk/car-search?sort=price-desc&radius=1500&postcode=e148dw&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&make=NISSAN&model=JUKE&aggregatedTrim=Nismo%20RS&page=";

            var b = new ScrapingBrowser();

            WebPage PageResult = b.NavigateToPage(new Uri(searchUrl + "1"));
            var nodes = PageResult.Html.SelectNodes("//*[@id=\"main-content\"]/div[1]/header/nav/ul/li[3]");

            // The pager text is split on spaces; the 4th token is read as the total page count.
            string count = nodes.First().InnerText;
            int    total = int.Parse(count.Split(' ')[3]);

            using (var csv = File.AppendText(@".\cars.csv"))
            {
                int id = 0;
                csv.WriteLine("id,description,price,mileage,year,age,transmission,size,power,fuel");

                // Page 1 is already loaded; every later page is fetched right before it is
                // processed, so the final page is written too and no request is wasted.
                for (int page = 1; page <= total; page++)
                {
                    if (page > 1)
                    {
                        PageResult = b.NavigateToPage(new Uri(searchUrl + page));
                    }

                    var results = PageResult.Html.SelectNodes("//html/body/main/section[2]/div[1]/ul/li");
                    if (results == null)
                    {
                        // SelectNodes returns null when the page has no matching list items.
                        continue;
                    }

                    foreach (var result in results)
                    {
                        // NOTE(review): "Featrured" looks like a typo for "Featured"; left as-is
                        // because the page's actual text cannot be confirmed from here.
                        if (result.InnerText.StartsWith("\nFeatrured"))
                        {
                            continue;
                        }

                        var infoNodes = result.SelectNodes("article/section[1]/div");
                        if (infoNodes == null)
                        {
                            continue;
                        }

                        // Skip the "you may also like" suggestion entries.
                        var also = result.SelectNodes("span");
                        if (also != null && also.First().InnerText.Contains("also like"))
                        {
                            continue;
                        }

                        var    info         = infoNodes.First();
                        var    price        = result.SelectNodes("article/section[2]/a/div").First().InnerText.Trim('?').Replace(",", "");
                        string description  = info.SelectNodes("h2/a").First().InnerText;
                        var    subInfo      = info.SelectNodes("ul/li");
                        string mileage      = subInfo[2].InnerText.Replace(" miles", "").Replace(",", "");
                        string year         = subInfo[0].InnerText.Substring(0, 4);
                        int    age          = DateTime.Now.Year - int.Parse(year);
                        string size         = subInfo[3].InnerText.Replace("L", "");
                        string power        = subInfo[4].InnerText.Replace("bhp", "");
                        string transmission = subInfo[5].InnerText;
                        string fuel         = subInfo[6].InnerText;
                        csv.WriteLine($"{id},{description},{price},{mileage},{year},{age},{transmission},{size},{power},{fuel}");
                        id++;
                    }
                }
            }
        }
Пример #32
0
 /// <summary>
 /// Downloads <paramref name="url"/> and parses the response into an <c>HtmlDocument</c>.
 /// </summary>
 /// <param name="url">Absolute URL to download.</param>
 /// <param name="encoding">
 /// Encoding the browser should use; when null the browser's own default is kept.
 /// </param>
 /// <returns>The parsed document.</returns>
 public static HtmlDocument GetHtmlDocument(string url, Encoding encoding = null)
 {
     var uri = new Uri(url);
     var b = new ScrapingBrowser();
     if (encoding != null)
     {
         // Use the caller-supplied encoding (the original assigned an unrelated name here).
         b.Encoding = encoding;
     }
     var htmlStr = b.DownloadString(uri);
     var doc = new HtmlDocument();
     doc.LoadHtml(htmlStr);
     return doc;
 }
Пример #33
0
        /// <summary>
        /// Reads the courier company list from the kuaidi100 home page, prints each distinct
        /// code/name pair, then runs one sample tracking query and waits for a key press.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            var homeAddress = new Uri("http://www.kuaidi100.com/");
            var browser = new ScrapingBrowser();

            var homeDocument = new HtmlAgilityPack.HtmlDocument();
            homeDocument.LoadHtml(browser.DownloadString(homeAddress));
            var companyLinks = homeDocument.DocumentNode.CssSelect("#companyList dl a");

            // Scrape the list of queryable courier companies offered by kuaidi100.
            IList<KuaiDiModel> listKD = new List<KuaiDiModel>();
            foreach (var companyLink in companyLinks)
            {
                string code = companyLink.Attributes["data-code"].Value;

                // The name is the content of the last <span>; empty when there is none.
                var lastSpan = companyLink.CssSelect("span").LastOrDefault();
                string name = lastSpan == null ? "" : lastSpan.InnerHtml;

                // Only add the company when its code has not been seen yet.
                if (!listKD.Any(known => known.Code == code))
                {
                    listKD.Add(new KuaiDiModel { Code = code, Name = name });
                }
            }

            foreach (var company in listKD)
            {
                Console.WriteLine(company.Code + "|" + company.Name);
            }

            // Test query: "type" is the courier company code, "postid" is the waybill number.
            var queryAddress = new Uri("http://www.kuaidi100.com/query?type=yunda&postid=1900491153505");
            string queryJson = browser.DownloadString(queryAddress);
            KuaiDiLog trackingLog = Util.ParseFromJson<KuaiDiLog>(queryJson);
            Console.ReadLine();
        }
Пример #34
0
        /// <summary>
        /// Determines how many catalog pages a blog has by inspecting the pager of
        /// <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// A <c>div.pager</c> element is searched for text of the form <c>共N页</c>. Otherwise the
        /// links in <c>div#pager</c> are used: a numeric last link is the answer; if the last link
        /// is not numeric, the last numbered link is followed and the search repeats there.
        /// </remarks>
        /// <param name="url">Absolute URL of a catalog page.</param>
        /// <returns>The detected page count; 1 when no pager information is found.</returns>
        public async Task<int> ExtractCatalogPageCount(string url)
        {
            int pageCount = 1;
            var uri = new Uri(url);
            var browser1 = new ScrapingBrowser();
            browser1.Encoding = Encoding.UTF8;
            var html1 = browser1.DownloadString(uri);
            var doc = new HtmlDocument();
            doc.LoadHtml(html1);
            var html = doc.DocumentNode;

            var pagers = await Task.Run(() => html.CssSelect("div.pager")).ConfigureAwait(false);

            var htmlNodes = pagers as IList<HtmlNode> ?? pagers.ToList();

            if (htmlNodes.Any())
            {
                var mp = Regex.Match(htmlNodes.First().InnerText, @"共(\d+)页");
                if (mp.Success)
                {
                    pageCount = int.Parse(mp.Groups[1].Value);
                }

                return pageCount;
            }

            var pagerLinks = html.CssSelect("div#pager>a").Select(t => t.InnerText).ToList();
            if (pagerLinks.Count == 0)
            {
                // No pager at all: keep the single-page default instead of throwing.
                return pageCount;
            }

            int lastLinkNumber;
            if (int.TryParse(pagerLinks[pagerLinks.Count - 1], out lastLinkNumber))
            {
                return lastLinkNumber;
            }

            // The last link is not a number (e.g. a "next" link). Walk backwards to the nearest
            // numbered link explicitly, rather than relying on an out-parameter side effect
            // inside a LINQ predicate, whose final value depends on how Last() enumerates.
            for (int index = pagerLinks.Count - 2; index >= 0; index--)
            {
                int numberedPage;
                if (int.TryParse(pagerLinks[index], out numberedPage))
                {
                    // That page may expose further pages, so repeat the detection from there.
                    return await ExtractCatalogPageCount(string.Format(CatalogsUrlTemplate, _blogName, numberedPage));
                }
            }

            return pageCount;
        }
Пример #35
0
        /// <summary>
        /// Downloads the Baidu news front page, collects every anchor that has an <c>href</c>
        /// attribute, and passes them to <c>OutputLinks</c> while mirroring progress into
        /// <c>Content</c> and <c>ProgressValue</c>.
        /// </summary>
        /// <remarks>
        /// This is <c>async void</c>, so exceptions thrown here cannot be observed by a caller.
        /// </remarks>
        private async void OnGetUrls()
        {
            var cnblog = new CnblogProcess();
            var browser = new ScrapingBrowser();
            var htmlDocument = new HtmlDocument();
            var html = browser.DownloadString(new Uri("http://news.baidu.com"));
            htmlDocument.LoadHtml(html);

            // Materialize once: the query is deferred, and it would otherwise be re-run over the
            // whole document for every single progress notification.
            var links = htmlDocument.DocumentNode.Descendants("a")
                .Where(x => x.Attributes.Contains("href"))
                .ToList();
            var linkCount = links.Count;

            var progress = new Progress<DownloadStringTaskAsyncExProgress>();
            Content = " ";
            var i = 0;

            progress.ProgressChanged += (s, e) =>
            {
                Content += e.Text + Environment.NewLine;
                ProgressValue = (double) i++ / linkCount;
            };

            await OutputLinks(links, progress).ConfigureAwait(false);
        }
Пример #36
0
        /// <summary>
        /// Loads the players from the result table under <paramref name="rootNode"/> and keeps
        /// following the "next" link, loading each further page, until there is none.
        /// </summary>
        /// <param name="browser">Browser used to fetch the follow-up pages.</param>
        /// <param name="rootNode">Root node of the first result page.</param>
        /// <param name="players">Bag passed on to <c>LoadPlayers</c> for every page.</param>
        private static void LoadAPageOfPlayers(ScrapingBrowser browser, HtmlNode rootNode, ConcurrentBag<Player> players)
        {
            var pageRoot = rootNode;

            while (true)
            {
                var resultBody = pageRoot.CssSelect("#result > tbody").SingleOrDefault();
                if (resultBody == null)
                {
                    return;
                }

                // The first child node of the table body is skipped; the rest go to LoadPlayers.
                LoadPlayers(players, resultBody.ChildNodes.Skip(1).ToList());

                var searchResults = resultBody.OwnerDocument.DocumentNode.CssSelect("#searchResults").Single();
                if (!searchResults.ChildNodes.Any())
                {
                    return;
                }

                // The second child of #searchResults holds the paging links.
                var pagingRow = searchResults.ChildNodes[1];
                var nextLink = pagingRow.ChildNodes.SingleOrDefault(n => n.InnerText.Trim() == "next");
                if (nextLink == null)
                {
                    return;
                }

                var href = nextLink.Attributes.Single(a => a.Name == "href").Value;
                var nextAddress = new Uri(BaseUrl + HttpUtility.HtmlDecode(href));
                pageRoot = browser.NavigateToPage(nextAddress).Html;
            }
        }
Пример #37
0
        /// <summary>
        /// Downloads <c>connectionURL</c> and reads the first table inside the div with the given
        /// id: the header cell texts become the first entry, followed by one entry per body row.
        /// </summary>
        /// <param name="divId">Id of the div that contains the table.</param>
        /// <returns>
        /// The rows read so far; any exception while reading the table is swallowed, so the
        /// list may be empty or partial.
        /// </returns>
        public List<string[]> ReadHtmlTable(string divId)
        {
            var pageSource = new ScrapingBrowser().DownloadString(new Uri(connectionURL));
            var rows = new List<string[]>();

            var page = new HtmlDocument();
            page.LoadHtml(pageSource);

            try
            {
                var container = page.DocumentNode.CssSelect("div#" + divId);
                var firstTable = container.CssSelect("Table").First();

                string[] headerTexts = firstTable.CssSelect("th").Select(h => h.InnerText).ToArray();
                rows.Add(headerTexts);

                var bodyRows = firstTable.CssSelect("tbody").CssSelect("tr");
                bodyRows.Each((row, i) => rows.Add(row.CssSelect("td").Select(d => d.InnerText).ToArray()));
            }
            catch
            {
                // Deliberately ignored: whatever was collected up to this point is returned.
            }

            return rows;
        }
Пример #38
0
 /// <summary>
 /// Downloads the document at <c>URL</c>, parses it, and stores the parsed document's
 /// inner HTML in <c>Content</c>.
 /// </summary>
 private void OnLoadUrl()
 {
     var address = new Uri(URL);
     var pageSource = new ScrapingBrowser().DownloadString(address);

     var parsed = new HtmlDocument();
     parsed.LoadHtml(pageSource);

     Content = parsed.DocumentNode.InnerHtml;
 }
 /// <summary>
 /// Creates the base processor and assigns a new <c>ScrapingBrowser</c> to <c>_scrapyBrowser</c>.
 /// </summary>
 public BlogProcessBase()
 {
     _scrapyBrowser = new ScrapingBrowser();
 }
Пример #40
0
        /// <summary>
        /// Downloads <c>connectionURL</c> and turns the first <c>table.ipos</c> table into one
        /// dictionary per date column: "End Date" plus one entry per line item, whose numeric
        /// value is parsed as en-US currency and multiplied by 1000.
        /// </summary>
        /// <returns>
        /// The reports built so far; any exception while reading the table is swallowed, so the
        /// result may be empty or partial.
        /// </returns>
        public IEnumerable<Dictionary<string, object>> ReadNASDAQFRHtmlTable()
        {
            var pageSource = new ScrapingBrowser().DownloadString(new Uri(connectionURL));
            var page = new HtmlDocument();
            var reports = new List<Dictionary<string, object>>();

            page.LoadHtml(pageSource);

            // Cells of the given class in a row, ignoring the "&nbsp;" placeholder cells.
            Func<HtmlNode, string, IEnumerable<HtmlNode>> filledCells =
                (row, selector) => row.CssSelect(selector).Where(cell => cell.InnerText != "&nbsp;");

            try
            {
                var rows = page.DocumentNode.CssSelect("table.ipos").First().CssSelect("tr");
                foreach (var row in rows)
                {
                    var dateCells = filledCells(row, "td.dkbluert");
                    var labelCell = filledCells(row, "td.dkbluelt").FirstOrDefault();
                    string key = labelCell != null ? labelCell.InnerText : "ReportDate";
                    if (key == "Quarter:")
                    {
                        continue;
                    }

                    // Every date cell opens a new report.
                    foreach (var dateCell in dateCells)
                    {
                        var report = new Dictionary<string, object>();
                        report["End Date"] = dateCell.InnerText;
                        reports.Add(report);
                    }

                    var numberCells = filledCells(row, "td.fundnum");

                    // The line item name comes from a "body1" cell, else from an "indent" cell.
                    var itemCell = filledCells(row, "td.body1").FirstOrDefault()
                                   ?? filledCells(row, "td.indent").FirstOrDefault();
                    if (itemCell == null)
                    {
                        continue;
                    }

                    string item = itemCell.InnerText;

                    // Make sure every report has the key, even when it gets no number below.
                    foreach (var report in reports)
                    {
                        report[item] = null;
                    }

                    // The n-th number in the row belongs to the n-th report.
                    int column = 0;
                    foreach (var numberCell in numberCells)
                    {
                        string number = numberCell.InnerText.Replace("\n", "").Replace("\t", "").Replace("\r", "");
                        var cf = new System.Globalization.CultureInfo("en-US");
                        reports[column][item] = Decimal.Parse(number, System.Globalization.NumberStyles.Currency, cf) * 1000;
                        column++;
                    }
                }
            }
            catch
            {
                // Deliberately ignored: whatever was collected up to this point is returned.
            }

            return reports;
        }
Пример #41
0
        /// <summary>
        /// Downloads <paramref name="url"/>, strips script, style and comment nodes, and builds one
        /// catalog per <c>div.day</c> block (or per <c>div#content</c> block when the page has no
        /// <c>div.day</c>), each holding the articles found inside it.
        /// </summary>
        /// <param name="url">Absolute URL of the catalog page.</param>
        /// <returns>The catalogs found on the page; empty when no block matches.</returns>
        private async Task<List<Catalog>> ExtractCatalogsFromWebPage(string url)
        {
            List<Catalog> resultCatalogs = new List<Catalog>();

            var uri = new Uri(url);
            var browser1 = new ScrapingBrowser();
            browser1.Encoding = Encoding.UTF8;
            var html1 = browser1.DownloadString(uri);

            var doc = new HtmlDocument();
            doc.LoadHtml(html1);
            var html = doc.DocumentNode;

            // Snapshot with ToArray() first: removing nodes while enumerating is not safe.
            foreach (var script in doc.DocumentNode.Descendants("script").ToArray())
            {
                script.Remove();
            }
            foreach (var style in doc.DocumentNode.Descendants("style").ToArray())
            {
                style.Remove();
            }

            // SelectNodes returns null (not an empty collection) when the page has no comments.
            var comments = doc.DocumentNode.SelectNodes("//comment()");
            if (comments != null)
            {
                foreach (var comment in comments.ToArray())
                {
                    comment.Remove();
                }
            }

            var days = html.CssSelect("div.day");
            if (!days.Any())
            {
                days = html.CssSelect("div#container >div#wrapper >div#content");
            }

            foreach (var day in days)
            {
                var catalog = new Catalog();
                catalog.IsChecked = true;

                var title = day.CssSelect("div.dayTitle").FirstOrDefault();
                if (title != null)
                {
                    // Dated layout: the day title names the catalog.
                    catalog.Title = title.InnerText.ClearNotWords();
                    foreach (var postTitle in day.CssSelect("div.postTitle"))
                    {
                        var article = new Article();
                        article.Title = postTitle.InnerText.ClearNotWords();
                        article.URL = postTitle.CssSelect("a.postTitle2").First().Attributes["href"].Value;
                        catalog.Articles.Add(article);
                    }
                }
                else
                {
                    // No day title: fall back to a generated catalog name.
                    catalog.Title = "CataLog" + DateTime.Now.ToShortTimeString();

                    foreach (var post in day.CssSelect("div.post"))
                    {
                        // Resolve the link once instead of re-running the selector per property.
                        var titleLink = post.CssSelect("a.PostTitle").First();

                        var article = new Article();
                        article.Title = titleLink.InnerText.ClearNotWords();
                        article.URL = titleLink.Attributes["href"].Value;
                        catalog.Articles.Add(article);
                    }
                }

                resultCatalogs.Add(catalog);
            }

            return resultCatalogs;
        }
Пример #42
0
        /// <summary>
        /// Opens the webscraper.io e-commerce test site, follows the first <c>.category-link</c>,
        /// and prints the decoded text of every <c>.price</c> node followed by a separator line.
        /// </summary>
        private static void example3()
        {
            const string baseLink = "http://webscraper.io";
            var b = new ScrapingBrowser();

            WebPage landingPage = b.NavigateToPage(new Uri(baseLink + "/test-sites/e-commerce/allinone"));
            var link = landingPage.Html.CssSelect(".category-link").First().GetAttributeValue("href");

            WebPage categoryPage = b.NavigateToPage(new Uri(baseLink + "/" + link));

            foreach (var priceNode in categoryPage.Html.CssSelect(".price"))
            {
                var priceText = WebUtility.HtmlDecode(priceNode.InnerText.Trim());
                Console.WriteLine(priceText);
                Console.WriteLine("--------------------------------------------------------------------------");
            }
        }
Пример #43
0
        /// <summary>
        /// Loads players for every last-name initial from 'A' to 'Z' in parallel and returns them
        /// all in one collection.
        /// </summary>
        /// <remarks>
        /// One <c>ScrapingBrowser</c> and one <c>HtmlWeb</c> are shared by all parallel iterations.
        /// NOTE(review): whether both are safe for concurrent use is not visible here — confirm.
        /// </remarks>
        /// <returns>The bag filled by <c>LoadAPageOfPlayers</c>.</returns>
        private static IEnumerable<Player> GetPlayersFromWeb()
        {
            var players = new ConcurrentBag<Player>();
            var htmlWeb = new HtmlWeb();
            var browser = new ScrapingBrowser();

            Parallel.For('A', 'Z' + 1, c =>
                {
                    char initial = Convert.ToChar(c);

                    Console.Write("\rLoading players whose last names begins with '{0}'.", initial);

                    var searchPath = String.Format("/players/search?category=lastName&filter={0}&playerType=current",
                                                   initial);
                    var firstPage = htmlWeb.Load(BaseUrl + searchPath);

                    LoadAPageOfPlayers(browser, firstPage.DocumentNode, players);
                });

            return players;
        }
Пример #44
-2
 /// <summary>
 /// Loads the page at <c>lolUrl</c>, lets <c>ParsePage</c> pick the table node, and parses
 /// that node's children into matches.
 /// </summary>
 /// <returns>The matches produced by <c>TableParser.Parse</c>.</returns>
 public static Match[] GetFractionalOdds()
 {
     var scraper = new ScrapingBrowser();
     WebPage oddsPage = scraper.NavigateToPage(new Uri(lolUrl));

     HtmlNode oddsTable = ParsePage(oddsPage.Html);

     return new TableParser(oddsTable.ChildNodes).Parse();
 }
Пример #45
-2
        /// <summary>
        /// Submits the Bing search form with the query "scrapysharp" via GET and asserts that the
        /// results page contains at least one result link.
        /// </summary>
        public void TestUsingForm()
        {
            // Set browser.UseDefaultCookiesParser = false if a site returns an invalid cookie format.
            var browser = new ScrapingBrowser();

            WebPage homePage = browser.NavigateToPage(new Uri("http://www.bing.com/"));

            PageWebForm searchForm = homePage.FindFormById("sb_form");
            searchForm["q"] = "scrapysharp";
            searchForm.Method = HttpVerb.Get;

            WebPage resultsPage = searchForm.Submit();
            HtmlNode[] resultLinks = resultsPage.Html.CssSelect("div.b_content h2 a").ToArray();

            bool foundAnyLink = resultLinks.Any();
            Assert.True(foundAnyLink);
        }
Пример #46
-3
        /// <summary>
        /// Loads every court website record from MongoDB, downloads each page (optionally narrowed
        /// by the record's XPath), and stores a diff plus timestamps whenever the HTML changed.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            Log.Logger = new LoggerConfiguration()
                .ReadFrom.AppSettings()
                .CreateLogger();
            var log = Log.Logger.ForContext<Crufty.CruftyConsole>();

            // get list of all courts to process
            var client = new MongoClient(ConfigurationManager.AppSettings["MongoServer"]);
            var database = client.GetDatabase("CourtCruft");
            var collection = database.GetCollection<CourtWebsite>("CourtWebsites");
            List<CourtWebsite> documents = new List<CourtWebsite>();
            Task.Run(async () =>
            {
                documents = await collection.Find(new BsonDocument()).ToListAsync();
            }).Wait();

            // loop over list of courts and scrape them.
            foreach (var courtWebsite in documents)
            {
                try
                {
                    // wrap in error handling so we catch 404s and "network down"
                    ScrapingBrowser browser = new ScrapingBrowser();
                    WebPage homePage = browser.NavigateToPage(new Uri(courtWebsite.Url));

                    // if no xpath filter, just grab the whole page
                    string currentPage;
                    if (String.IsNullOrEmpty(courtWebsite.SelectionXPathString))
                    {
                        currentPage = homePage.Html.InnerHtml;
                    }
                    else
                    {
                        // SelectSingleNode returns null when the XPath matches nothing; map that
                        // to a null page so the "Unable to find" branch below reports it, instead
                        // of a NullReferenceException ending up in the generic handler.
                        var selectedNode = homePage.Html.SelectSingleNode(courtWebsite.SelectionXPathString);
                        currentPage = selectedNode == null ? null : selectedNode.InnerHtml;
                    }

                    if (String.IsNullOrWhiteSpace(currentPage))
                    {
                        log.Error("Unable to find {XPath} in {Url}", courtWebsite.SelectionXPathString, courtWebsite.Url);
                        continue;
                    }

                    if (courtWebsite.NewPageHtml != currentPage)
                    {
                        // NOTE(review): the inline stylesheet has no closing brace for the "del"
                        // rule; left byte-identical because it is stored output.
                        HtmlDiff diffHelper = new HtmlDiff(courtWebsite.NewPageHtml, currentPage);
                        courtWebsite.DiffedHtml = diffHelper.Build().Insert(0, "<style>ins {background-color: #cfc;text-decoration: none;} del {    color: #999;    background-color:#FEC8C8;</style>");
                        courtWebsite.OldPageHtml = courtWebsite.NewPageHtml;
                        courtWebsite.NewPageHtml = currentPage;
                        courtWebsite.LastChangedDateTime = DateTime.Now;
                        courtWebsite.Checked = false;

                        log.Information("Changes found for {CourtName}", courtWebsite.CourtName);
                    }

                    courtWebsite.LastRunDateTime = DateTime.Now;

                    // Persist the record whether or not the page changed (LastRunDateTime moved).
                    var filter = Builders<CourtWebsite>.Filter.Eq(s => s.Id, courtWebsite.Id);
                    Task.Run(async () =>
                    {
                        await collection.ReplaceOneAsync(filter, courtWebsite);
                    }).Wait();

                    log.Verbose("Successfully scraped site {CourtName}", courtWebsite.CourtName);
                }
                catch (WebException exception)
                {
                    log.Error(exception, "Web Exception for {CourtName} (check the url) {ExceptionMessage} {url}", courtWebsite.CourtName, exception.Message, courtWebsite.Url);
                }
                catch (Exception exception)
                {
                    log.With("CourtWebsite", courtWebsite)
                        .Error(exception, "General Exception for {CourtName} {ExceptionMessage}", courtWebsite.CourtName, exception.Message);
                }
            }
        }