Beispiel #1
0
        //private void ScrapAmbitionCompanies()
        //{
        //    var companyList = new ConcurrentBag<CompanyModel>();

        //    var companyUrls = new List<string>();
        //    var scrapped = new List<string>();

        //    if (File.Exists(@"C:\Users\Ashish\Desktop\CompanyUrls.txt"))
        //    {
        //        var allUrls = File.ReadAllLines(@"C:\Users\Ashish\Desktop\CompanyUrls.txt").ToList();
        //        var scrappedUrls = new List<string>();

        //        if (File.Exists(@"C:\Users\Ashish\Desktop\Companies(Ambition)ScrappedUrls.txt"))
        //            scrappedUrls = File.ReadAllLines(@"C:\Users\Ashish\Desktop\Companies(Ambition)ScrappedUrls.txt").ToList();

        //        companyUrls = allUrls.Except(scrappedUrls).ToList();
        //    }
        //    else
        //    {
        //        Parallel.For(1, 8306, (i) =>
        //        {
        //            var browser = new HtmlWeb();
        //            var url = "https://www.glassdoor.co.in/Reviews/india-reviews-SRCH_IL.0,5_IN115.htm";
        //            if (i > 1)
        //            {
        //                url = "https://www.glassdoor.co.in/Reviews/india-reviews-SRCH_IL.0,5_IN115_IP" + i + ".htm";
        //            }

        //            var success = false;
        //            do
        //            {
        //                try
        //                {
        //                    var comapniesDoc = browser.Load(url);
        //                    var companies = comapniesDoc.DocumentNode.SelectNodes("//div[contains(@class,'eiHdrModule module snug')]");
        //                    foreach (var company in companies)
        //                    {
        //                        var companyLink = company.SelectSingleNode(".//a[contains(@class,'sqLogoLink')]");
        //                        companyUrls.Add("https://www.glassdoor.co.in" + companyLink.Attributes["href"].Value);
        //                    }
        //                    success = true;
        //                }
        //                catch (Exception e)
        //                {
        //                    Console.WriteLine("Error Occured, Retrying in 1 sec ...");
        //                    Thread.Sleep(1000);
        //                }
        //            }
        //            while (!success);
        //        });

        //        File.WriteAllLines(@"C:\Users\Ashish\Desktop\CompanyUrls.txt", companyUrls);
        //    }

        //    Parallel.ForEach(companyUrls, (url) =>
        //    {
        //        var success = false;
        //        var browser = new HtmlWeb();
        //        var company = new CompanyModel();
        //        HtmlDocument doc = null;
        //        do
        //        {
        //            try
        //            {
        //                doc = browser.Load(url);
        //                var companyData = doc.DocumentNode.SelectNodes("//div[contains(@class,'infoEntity')]");

        //                company.Name = doc.DocumentNode.SelectSingleNode("//h1[contains(@class,'strong tightAll')]").InnerText.Trim();
        //                var logo = doc.DocumentNode.SelectSingleNode("//span[contains(@class,'lgSqLogo')]/img")?.Attributes["src"].Value.Trim();

        //                if (!string.IsNullOrWhiteSpace(logo))
        //                {
        //                    company.Logo = company.Name.Replace(" ", "-").Replace("|", "") + Path.GetExtension(logo);
        //                    company.LogoUrl = logo;

        //                    var fileName = @"C:\Users\Ashish\Desktop\LogosAmbition\" + company.Logo;
        //                    if (!File.Exists(fileName))
        //                    {
        //                        using (var client = new WebClient())
        //                        {
        //                            client.DownloadFile(logo, @"C:\Users\Ashish\Desktop\LogosAmbition\" + company.Logo);
        //                        }
        //                    }
        //                }

        //                foreach (var data in companyData)
        //                {
        //                    if (data.SelectSingleNode("./label").InnerText == "Website")
        //                    {
        //                        company.Website = data.SelectSingleNode("./span").InnerText.Trim();
        //                    }
        //                    else if (data.SelectSingleNode("./label").InnerText == "Headquarters")
        //                    {
        //                        company.Headquarters = data.SelectSingleNode("./span").InnerText.Trim();
        //                    }
        //                    else if (data.SelectSingleNode("./label").InnerText == "Size")
        //                    {
        //                        company.Size = data.SelectSingleNode("./span").InnerText.Trim();
        //                    }
        //                    else if (data.SelectSingleNode("./label").InnerText == "Founded")
        //                    {
        //                        company.Founded = data.SelectSingleNode("./span").InnerText.Trim();
        //                    }
        //                    else if (data.SelectSingleNode("./label").InnerText == "Industry")
        //                    {
        //                        company.Industry = data.SelectSingleNode("./span").InnerText.Trim();
        //                    }
        //                }
        //                success = true;
        //            }
        //            catch (Exception e)
        //            {
        //                Console.WriteLine("Error Occured, Retrying in 2 sec ...");
        //                Thread.Sleep(2000);
        //            }
        //        }
        //        while (!success);

        //        company.Description = doc.DocumentNode.SelectSingleNode("//div[contains(@class,'margTop empDescription')]")?.Attributes["data-full"].Value.Trim();

        //        lock (lockOb)
        //        {
        //            companyList.Add(company);
        //            scrapped.Add(url);

        //            if (companyList.Count >= 2622)
        //            {
        //                var json = JsonConvert.SerializeObject(companyList);
        //                File.WriteAllText(@"C:\Users\Ashish\Desktop\Companies(Ambition)" + DateTime.Now.Ticks + ".json", json);
        //                File.AppendAllLines(@"C:\Users\Ashish\Desktop\Companies(Ambition)ScrappedUrls.txt", scrapped);

        //                companyList.Clear();
        //                scrapped.Clear();
        //            }
        //        }
        //        Console.WriteLine("Scrapped: " + url);
        //    });
        //}

        /// <summary>
        /// For each Glassdoor review URL, follows the link to the company's interview page
        /// and builds an <c>InterviewModel</c> for every interview entry found there.
        /// </summary>
        /// <remarks>
        /// The URL list is created empty inside this method, so as written the loop body never
        /// executes. The collected models are kept in a per-page local list and are neither
        /// returned nor persisted.
        /// </remarks>
        private void ScrapGlassDoor()
        {
            var reviewUrls = new List<string>();

            var browser = new HtmlWeb();

            foreach (var url in reviewUrls)
            {
                var doc = browser.Load(url);

                // Skip pages that have no interviews link (or a link without an href)
                // instead of failing with a NullReferenceException.
                var interviewPageLink = doc.DocumentNode.SelectSingleNode("//a[contains(@class,'eiCell cell interviews')]");
                var interviewPageUrl  = interviewPageLink?.Attributes["href"]?.Value;
                if (string.IsNullOrWhiteSpace(interviewPageUrl))
                {
                    continue;
                }

                var interviewPage = browser.Load(interviewPageUrl);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                var interviews = interviewPage.DocumentNode.SelectNodes("//div[contains(@class,'empReview')]");
                if (interviews == null)
                {
                    continue;
                }

                var interviewsResult = new List<InterviewModel>();
                foreach (var interview in interviews)
                {
                    // The leading "." scopes each query to the current interview node; a bare
                    // "//" searches from the document root and would return the first match
                    // on the page for every interview.
                    var model = new InterviewModel
                    {
                        PostedOn      = interview.SelectSingleNode(".//time[contains(@class,'date subtle')]")?.InnerText.Trim(),
                        AboutEmployee = interview.SelectSingleNode(".//div[contains(@class,'author')]")?.InnerText.Trim(),
                        Description   = interview.SelectSingleNode(".//p[contains(@class,'interviewDetails')]")?.InnerText.Trim(),
                    };

                    interviewsResult.Add(model);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Scrapes interview experiences from ambitionbox.com and writes them, serialized as
        /// JSON, to <c>Interviews(Ambition).json</c> on the desktop.
        /// </summary>
        /// <remarks>
        /// Interview page URLs are read from <c>InterviewUrls.txt</c> when that file exists.
        /// Otherwise they are discovered by crawling company listing pages 1 to 149 and queuing
        /// the first three interview pages of every company found.
        /// Pages, entries and attributes that are missing from the markup are skipped so that a
        /// single malformed page does not abort the run before the output file is written.
        /// NOTE(review): discovered URLs are never written back to <c>InterviewUrls.txt</c> —
        /// confirm whether that file is meant to be maintained by hand.
        /// </remarks>
        private void ScrapAmbitionBoxInterviews()
        {
            const string interviewUrlsPath = @"C:\Users\Ashish\Desktop\InterviewUrls.txt";
            const string outputPath        = @"C:\Users\Ashish\Desktop\Interviews(Ambition).json";

            var browser       = new HtmlWeb();
            var interviewList = new List<InterviewModel>();
            var interviewUrls = new List<string>();

            if (File.Exists(interviewUrlsPath))
            {
                interviewUrls = File.ReadAllLines(interviewUrlsPath).ToList();
            }
            else
            {
                for (int i = 1; i < 150; i++)
                {
                    var listingUrl   = "https://www.ambitionbox.com/interviews/companies" + "?page=" + i;
                    var companiesDoc = browser.Load(listingUrl);

                    // SelectNodes yields null when the page has no company tiles
                    // (for example when the page number is past the last listing page).
                    var companies = companiesDoc.DocumentNode.SelectNodes("//div[contains(@class,'company_tile_wrap')]");
                    if (companies == null)
                    {
                        continue;
                    }

                    foreach (var company in companies)
                    {
                        var href = company.SelectSingleNode(".//div[contains(@class,'company_logo')]/a")?.Attributes["href"]?.Value;
                        if (string.IsNullOrWhiteSpace(href))
                        {
                            continue;
                        }

                        // Queue the first three interview pages of each company.
                        interviewUrls.Add(href);
                        interviewUrls.Add(href + "?page=2");
                        interviewUrls.Add(href + "?page=3");
                    }
                }
            }

            foreach (var url in interviewUrls)
            {
                // Blank lines in the cached URL file are not loadable pages.
                if (string.IsNullOrWhiteSpace(url))
                {
                    continue;
                }

                var doc         = browser.Load(url);
                var companyName = doc.DocumentNode.SelectSingleNode("//p[contains(@class,'h1')]")?.InnerText.Trim();
                if (string.IsNullOrWhiteSpace(companyName))
                {
                    continue;
                }

                var website    = doc.DocumentNode.SelectSingleNode("//div[contains(@class,'company-stats')]//tr[1]//tr[1]//a")?.Attributes["href"]?.Value?.Trim();
                var interviews = doc.DocumentNode.SelectNodes("//*[@id='reviewsContainer']/article");

                if (interviews == null)
                {
                    continue;
                }

                foreach (var interview in interviews)
                {
                    var model = new InterviewModel
                    {
                        JobTitle      = interview.SelectSingleNode(".//h2[contains(@class,'review-title')]/a")?.InnerText.Trim(),
                        AboutEmployee = interview.SelectSingleNode(".//div[contains(@class,'user-id')]")?.InnerText.Trim(),
                        PostedOn      = interview.SelectSingleNode(".//div[contains(@class,'time meta-data')]//time")?.InnerText.Trim(),
                        Company       = companyName,
                        WebSite       = website,
                        Rounds        = new List<InterviewRoundModel>()
                    };

                    // Only assign when the element exists, so the model's own default is kept otherwise.
                    var overallExp = interview.SelectSingleNode(".//p[contains(@class,'overall_experience_text')]");
                    if (overallExp != null)
                    {
                        model.OverAllExp = overallExp.InnerText.Trim();
                    }

                    var desc = interview.SelectSingleNode(".//p[contains(@class,'job_source_text')]");
                    if (desc != null)
                    {
                        model.Description = desc.InnerText.Trim();
                    }

                    var rounds = interview.SelectNodes(".//div[contains(@class,'interview_round_wrap')]");
                    if (rounds != null)
                    {
                        foreach (var round in rounds)
                        {
                            var roundModel = new InterviewRoundModel
                            {
                                Name      = round.SelectSingleNode("./h3")?.InnerText.Trim(),
                                Questions = new List<QuestionModel>()
                            };

                            // A round carries either a free-text description or a list of question links.
                            var description = round.SelectSingleNode(".//p[contains(@class,'row_description')]");
                            if (description != null)
                            {
                                roundModel.Description = description.InnerText.Trim();
                            }
                            else
                            {
                                var questions = round.SelectNodes(".//div[contains(@class,'row_description')]//ul[contains(@class,'questions')]//a");
                                if (questions != null)
                                {
                                    foreach (var ques in questions)
                                    {
                                        // The question text is read from the link's title attribute;
                                        // links without one carry nothing to record.
                                        var title = ques.Attributes["title"]?.Value;
                                        if (title == null)
                                        {
                                            continue;
                                        }

                                        roundModel.Questions.Add(new QuestionModel { Desc = title.Trim() });
                                    }
                                }
                            }

                            model.Rounds.Add(roundModel);
                        }
                    }

                    interviewList.Add(model);
                }

                System.Console.WriteLine("Scrapped: " + url);
            }

            var data = JsonConvert.SerializeObject(interviewList);

            File.WriteAllText(outputPath, data);
        }