Example #1
0
        private LinkedinItem parse_details(Page page)
        {
            LinkedinItem item = new LinkedinItem();

            var htmldoc = page.Document.DocumentNode.SelectSingleNode("//div[contains(@id, 'top-card')]");

            item.Name        = htmldoc.SelectSingleNode(".//div[contains(@id, 'name')]//span[contains(@class, 'full-name')]").Extract();
            item.Description = HtmlAgilityPackExtensions.RemoveHtmlTags(htmldoc.SelectSingleNode(".//div[contains(@id, 'headline')]/p[contains(@class, 'title')]").Extract());
            item.Image       = htmldoc.SelectSingleNode(".//div[contains(@class, 'profile-picture')]//img").Extract("src");
            item.Email       = htmldoc.SelectSingleNode(".//div[contains(@id, 'email')]//ul//li").Extract();
            item.Phone       = htmldoc.SelectNodes(".//div[contains(@id, 'phone')]//ul//li").Extract();

            return(item);
        }
Example #2
0
        protected override LinkedinItem Crawl(Page page)
        {
            if (page.Url.depth == -1) //root
            {
                var htmldoc = page.Document.DocumentNode.SelectSingleNode("//code[contains(@id, 'voltron_srp_main-content')]").ExtractDecode(false);
                //remove <!-- -->
                htmldoc = htmldoc.Remove(0, 4);
                htmldoc = htmldoc.Remove(htmldoc.Length - 3, 3);

                dynamic jsonData = JsonSerializer.Deserialize <dynamic>(htmldoc);

                var root    = jsonData.content.page.voltron_unified_search_json.search;
                var results = root.results;

                foreach (var item in results)
                {
                    var description = HtmlAgilityPackExtensions.RemoveHtmlTags(item.person.fmt_headline.Value);

                    if (CheckString(description, _searchWord))
                    {
                        //add the details itens
                        AddProcess(item.person.link_nprofile_view_headless.Value, page);
                    }
                }

                //we only run the roots pages 1 by 1 so we dont need have lock the count
                if (_count <= 10)// crawl 10 pagination pages
                {
                    Interlocked.Increment(ref _count);
                    //add the pagination url
                    AddProcess(page.CleanUrl(root.baseData.resultPagination.nextPage.pageURL.Value));
                }
                return(null);
            }

            return(parse_details(page));
        }