示例#1
0
        /// <summary>
        /// Builds the developer record from a parsed developer page: the developer name,
        /// the names of the listed apps and their package identifiers.
        /// </summary>
        /// <param name="document">Parsed HTML of the developer page.</param>
        /// <param name="url">Address the page was loaded from; stored unchanged.</param>
        /// <exception cref="Exception">
        /// The heading node is missing, or the app links do not all share one of the two
        /// recognised details-URL prefixes.
        /// </exception>
        public DeveloperData(HtmlDocument document, string url)
        {
            const string relativePrefix = "/store/apps/details?id=";
            const string absolutePrefix = "https://play.google.com/store/apps/details?id=";

            this.url = url;
            HtmlNode root = document.DocumentNode;

            HtmlNode heading = root.SelectSingleNode("//h1[@class='cluster-heading']");
            if (heading == null)
            {
                throw new Exception("developer name not found");
            }
            name = heading.InnerText.Trim();

            // SelectNodes yields null (not an empty collection) when nothing matches;
            // a developer page without app cards is treated as "no apps".
            IEnumerable<HtmlNode> titleLinks = root.SelectNodes("//div[@class='card-list']//a[@class='title']");
            if (titleLinks == null)
            {
                titleLinks = Enumerable.Empty<HtmlNode>();
            }
            List<HtmlNode> links = titleLinks.ToList();

            appsNames = links.Select(el => el.InnerText.Trim()).ToList();

            // A link without href becomes "" so it fails the prefix checks below
            // instead of raising a NullReferenceException.
            List<string> hrefs = links.Select(el => el.GetAttributeValue("href", string.Empty)).ToList();

            if (hrefs.All(h => h.StartsWith(relativePrefix, StringComparison.Ordinal)))
            {
                // Strip only the leading prefix; Replace would also remove later occurrences.
                appsIDs = hrefs.Select(h => h.Substring(relativePrefix.Length)).ToList();
            }
            else if (hrefs.All(h => h.StartsWith(absolutePrefix, StringComparison.Ordinal)))
            {
                appsIDs = hrefs.Select(h => h.Substring(absolutePrefix.Length)).ToList();
            }
            else
            {
                throw new Exception("incorrect developer apps IDs");
            }
        }
示例#2
0
        /// <summary>
        /// Evaluates the XPath rule of <paramref name="model"/> against the current node and
        /// returns the non-empty extracted strings.
        /// </summary>
        /// <param name="model">Extraction rule: XPath, optional attribute names, extract type.</param>
        /// <returns>
        /// The extracted values, or <c>null</c> when the model or its rule is missing or
        /// the XPath matches nothing.
        /// </returns>
        public List<string> ParseList(XpathExtractModel model)
        {
            if (model == null || string.IsNullOrEmpty(model.XpathRule))
            {
                return null;
            }

            HtmlNodeCollection matches = this.htmlNode?.SelectNodes(model.XpathRule);
            if (matches == null || matches.Count <= 0)
            {
                return null;
            }

            IEnumerable<string> values;
            if (model.XpathEndAttributes != null && model.XpathEndAttributes.Count > 0)
            {
                // Per node: trimmed value of the first attribute whose name is listed.
                values = matches.Select(node => node.Attributes
                                        .Where(attr => model.XpathEndAttributes.Contains(attr.Name))
                                        .Select(attr => attr.Value?.Trim())
                                        .FirstOrDefault());
            }
            else if (model.ExtractType == ExtractType.Text)
            {
                values = matches.Select(node => node.InnerText.Trim());
            }
            else
            {
                // ExtractType.Html and any other value fall back to the inner markup.
                values = matches.Select(node => node.InnerHtml.Trim());
            }

            return values.Where(value => !string.IsNullOrEmpty(value)).ToList();
        }
        /// <summary>
        /// Downloads the Cricbuzz live-scores page and builds one entry per match block,
        /// pairing the title of the block's first link with a commentary JSON address
        /// derived from the third '/'-separated segment of that link's href.
        /// </summary>
        /// <returns>The matches found, or <c>null</c> if anything fails along the way.</returns>
        public static List<clsCricketMatches> GetLiveMatches()
        {
            try
            {
                HtmlDocument page = new HtmlWeb().Load("https://www.cricbuzz.com/cricket-match/live-scores");
                HtmlNodeCollection matchBlocks = page.DocumentNode.SelectNodes("//div[contains(@class, 'cb-col cb-col-100 cb-lv-main')]");

                var matches = new List<clsCricketMatches>();

                // A missing block list, link or attribute throws here and ends up in the catch.
                foreach (HtmlNode block in matchBlocks)
                {
                    HtmlNode link = block.SelectSingleNode(".//a");
                    string[] hrefParts = link.Attributes["href"].Value.Split('/');
                    string commentaryUrl = "https://www.cricbuzz.com/match-api/" + hrefParts[2] + "/commentary.json";
                    string title = link.Attributes["title"].Value;

                    matches.Add(new clsCricketMatches
                    {
                        objMatchName      = title,
                        objMatchHyperLink = commentaryUrl
                    });
                }

                return matches;
            }
            catch (Exception)
            {
                // Callers receive null for any failure (network, markup change, ...).
                return null;
            }
        }
示例#4
0
        /// <summary>
        /// Loads the videos of a category: follows an "all episodes" link when the page has one,
        /// falls back to a single-video page when no item list is present, and otherwise walks
        /// the remaining result pages.
        /// </summary>
        /// <param name="category">Category to load; its Url is read through a cast to RssLink.</param>
        /// <returns>The videos found (possibly empty).</returns>
        List<VideoInfo> getVideos(Category category)
        {
            const string itemXPath = @"//div[contains(@class, 'content-item')]";

            var found = new List<VideoInfo>();
            string url = (category as RssLink).Url;

            HtmlNode page = GetWebData<HtmlDocument>(url).DocumentNode;

            // Prefer the 'all episodes' listing when the page links to one.
            HtmlNode allEpisodesLink = page.SelectSingleNode(@"//a[starts-with(@href, '/iplayer/episodes/')]");
            if (allEpisodesLink != null)
            {
                page = GetWebData<HtmlDocument>(BASE_URL + allEpisodesLink.GetAttributeValue("href", "")).DocumentNode;
            }

            HtmlNodeCollection items = page.SelectNodes(itemXPath);

            // No item list: treat the page as a single video.
            if (items == null)
            {
                HtmlNode main = page.SelectSingleNode(@"//div[@id='main']");
                VideoInfo single = main != null ? createSingleVideo(main, url) : null;
                if (single != null)
                {
                    found.Add(single);
                }
                return found;
            }

            found.AddRange(items.Select(item => createVideo(item, category.Name)).Where(video => video != null));

            // NOTE(review): the page count comes from the page actually shown (possibly the
            // 'all episodes' listing), but further pages are requested from the original
            // category url - confirm this is intended.
            int pageCount = getPageCount(page);
            for (int pageNumber = 2; pageNumber <= pageCount; pageNumber++)
            {
                page  = GetWebData<HtmlDocument>(url + "?page=" + pageNumber).DocumentNode;
                items = page.SelectNodes(itemXPath);
                if (items == null)
                {
                    break;
                }
                found.AddRange(items.Select(item => createVideo(item, category.Name)).Where(video => video != null));
            }

            return found;
        }
        /// <summary>
        /// Converts the discipline nodes of one weekday into a <see cref="ClassDay"/>.
        /// </summary>
        /// <param name="nodes">
        /// One node per class; the inner text is the abbreviated discipline name and the
        /// "title" attribute carries the full name.
        /// </param>
        /// <param name="day">Name of the weekday, stored as-is.</param>
        /// <param name="schedules">
        /// Schedule key for each node, by position; must have at least as many entries as
        /// <paramref name="nodes"/> and no duplicates among them.
        /// </param>
        /// <returns>The day with its classes looked up under the keys "1" to "4".</returns>
        private static ClassDay ParseToClassDay(HtmlNodeCollection nodes, string day, List<string> schedules)
        {
            // Removes the code in front of the full name, e.g. "#1047 - name".
            // \P{L} (any non-letter) keeps names that start with an accented letter intact,
            // which the previous ASCII-only class would have truncated.
            var leadingCode = new Regex(@"^\P{L}+");

            // The dictionary guarantees each class ends up in the same slot as on the site.
            var dayDisciplines = new Dictionary<string, TimeTableDiscipline>();

            for (int i = 0; i < nodes.Count; i++)
            {
                HtmlNode node = nodes[i];

                // The site shows the abbreviation; the objects are filled with the full name too.
                dayDisciplines.Add(schedules[i], new TimeTableDiscipline()
                {
                    NickName = node.InnerText.Trim(),
                    Name     = leadingCode.Replace(node.GetAttributeValue("title", ""), String.Empty)
                });
            }

            return new ClassDay()
            {
                DayOfWeek   = day,
                FirstClass  = GetValue(dayDisciplines, "1"),
                SecondClass = GetValue(dayDisciplines, "2"),
                ThirdClass  = GetValue(dayDisciplines, "3"),
                FourthClass = GetValue(dayDisciplines, "4"),
            };
        }
示例#6
0
        /// <summary>
        /// Reads the paging links of a search result page and registers every page number that
        /// follows the current page and has not been seen before.
        /// </summary>
        /// <param name="doc">Parsed search result page.</param>
        /// <returns>
        /// The page numbers newly added to <c>_pageNumbers</c>; empty when the page has no
        /// paging table or no highlighted current page.
        /// </returns>
        /// <exception cref="FormatException">A paging link's text is not a number.</exception>
        private static List<int> GetNewSearchPageNumbers(HtmlDocument doc)
        {
            var result = new List<int>();

            HtmlNodeCollection pageLinks   = doc.DocumentNode.SelectNodes("//table[@class='searchresultpaging'][1]/tr/td[2]/a");
            HtmlNode           currentNode = doc.DocumentNode.SelectSingleNode("//table[@class='searchresultpaging'][1]/tr/td[2]/a/p/b");

            // Both lookups return null when the paging table is absent.
            if (pageLinks == null || currentNode == null)
            {
                return result;
            }

            int  currentPageNumber = Convert.ToInt32(currentNode.InnerText);
            bool passedCurrent     = false;

            foreach (HtmlNode link in pageLinks)
            {
                int pageNumber = Convert.ToInt32(link.InnerText);

                if (pageNumber == currentPageNumber)
                {
                    passedCurrent = true;
                    continue;
                }
                if (!passedCurrent)
                {
                    continue;
                }

                // The membership test must happen under the same lock as the insert;
                // otherwise two threads can both see "not present" and add the page twice.
                lock (_pageNumbers)
                {
                    if (!_pageNumbers.Contains(pageNumber))
                    {
                        _pageNumbers.Add(pageNumber);
                        result.Add(pageNumber);
                    }
                }
            }

            return result;
        }
示例#7
0
        /// <summary>
        /// Handles a downloaded listing page: extracts the torrent entries from the second
        /// column of the index table and raises <c>OnPostReceived</c> with them, or reports a
        /// parsing error in the status bar when the table is not found.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Carries the downloaded page.</param>
        private void Page_onPageDownload(object sender, HTMLPageEventArgs e)
        {
            // Null when the expected table is missing from the page.
            HtmlNodeCollection titleCells = e.Page.DocumentNode.SelectNodes(@"//div[@id=""index""]//tr[position()>1]/td[2]");

            // Links on the page are site-relative, so the working URL minus "/soft"
            // is prepended to form the full address.
            string siteRoot = MainFunc.rutorWorkURL.Replace(@"/soft", "");

            if (titleCells == null)
            {
                Program.statusBarGlobal.Message = "Ошибка на этапе парсинга страницы";
                return;
            }

            var posts = new List<TrackersListItem>(titleCells.Count);
            for (int i = 0; i < titleCells.Count; i++)
            {
                HtmlNode cell = titleCells[i];
                posts.Add(new TrackersListItem
                {
                    // HtmlDecode turns HTML escape sequences back into plain characters.
                    Name   = HttpUtility.HtmlDecode(cell.LastChild.InnerText),
                    Href   = siteRoot + cell.LastChild.GetAttributeValue("href", null),
                    Index  = i,
                    Magnet = cell.ChildNodes[1].GetAttributeValue("href", null),
                });
            }

            RutorListEventArgs eventArgs = new RutorListEventArgs(posts);
            // Raised only when there are subscribers.
            OnPostReceived?.Invoke(this, eventArgs);
        }
示例#8
0
        /// <summary>
        /// Runs a search request and converts each result container of the response into a
        /// <see cref="WebPage"/>.
        /// </summary>
        /// <param name="query">Search text; also stored on every returned page.</param>
        /// <param name="page">Result page index passed to <c>SearchUri</c>.</param>
        /// <returns>
        /// The parsed results; an empty list when the request fails or no result containers
        /// are found. Title, link or snippet are null when the matching node is absent.
        /// </returns>
        public async Task<IList<WebPage>> SearchAsync(string query, int page)
        {
            // The response owns the connection/content stream, so release it deterministically.
            using (HttpResponseMessage response = await _client.GetAsync(SearchUri(query, page)))
            {
                if (!response.IsSuccessStatusCode)
                {
                    return new List<WebPage>();
                }

                string data = await response.Content.ReadAsStringAsync();

                var doc = new HtmlDocument();
                doc.LoadHtml(data);

                // Null (not empty) when the page contains no result containers.
                HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//div[@class='g']");
                if (nodes == null)
                {
                    return new List<WebPage>();
                }

                return nodes.Select(node =>
                {
                    HtmlNode titleNode = node.Descendants("h3")
                                         .FirstOrDefault(x => x.GetAttributeValue("class", string.Empty).Contains("LC20lb"));
                    HtmlNode linkNode = node.Descendants("a").FirstOrDefault();
                    HtmlNode snippetNode = node.Descendants("span")
                                           .FirstOrDefault(x => x.GetAttributeValue("class", string.Empty).Contains("st"));

                    string title = HttpUtility.HtmlDecode(titleNode?.InnerText);
                    // An anchor without href yields a null link instead of a NullReferenceException.
                    string link = HttpUtility.HtmlDecode(linkNode?.Attributes["href"]?.Value);
                    string snippet = HttpUtility.HtmlDecode(snippetNode?.InnerText);

                    return new WebPage(query, title, link, snippet, searchTag);
                }).ToList();
            }
        }
示例#9
0
文件: Program.cs 项目: hafei/Alpaca
        /// <summary>
        /// Reads the site address from config.json, downloads the page and logs the link and
        /// text of every title anchor found on it.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Logger
            var loggerFactory = new LoggerFactory().AddConsole();
            var logger        = loggerFactory.CreateLogger(typeof(Program));

            // Configuration
            var config = new ConfigurationBuilder()
                         .SetBasePath(Directory.GetCurrentDirectory())
                         .AddJsonFile("config.json")
                         .Build();
            var site = config.GetSection("Site").Value;

            var html = HTTPUtil.GetHtml(site);
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(html);

            // XPath query; SelectNodes returns null when nothing matches.
            HtmlNodeCollection categoryNodes = doc.DocumentNode.SelectNodes("//h3/a[@class='titlelnk']");

            if (categoryNodes == null)
            {
                logger.LogInformation("No title links found at {Site}", site);
            }
            else
            {
                foreach (HtmlNode node in categoryNodes)
                {
                    // Scraped text goes in as arguments, never as the message template,
                    // so braces in a title cannot break the log formatter.
                    logger.LogInformation("{Link}  {Title}", node.GetAttributeValue("href", string.Empty), node.InnerText);
                }
            }

            Console.ReadLine();
        }
示例#10
0
文件: Taiwan.cs 项目: ghpdev2019/Rate
        /// <summary>
        /// Builds the exchange-rate table of a bank from the rate rows of its page.
        /// </summary>
        /// <param name="Node">Row nodes; one rate entry is produced per node, by position.</param>
        /// <param name="EnumBank">Bank the table belongs to; stored as the table key.</param>
        /// <returns>The table stamped with today's date and an expiry <c>Expire</c> minutes later.</returns>
        public static DataMeta TaiwanBK_GetRate(this HtmlNodeCollection Node, EnumBank EnumBank)
        {
            DataMeta Table = new DataMeta();

            Table.CreateDate = MyTimeZone.Today;
            Table.Expire     = MyTimeZone.Today.AddMinutes(Expire);
            Table.Key        = EnumBank;

            for (int row = 0; row < Node.Count; row++)
            {
                HtmlNode node = Node[row];

                // NOTE(review): these XPath expressions start with "//", which in XPath is
                // resolved from the document root rather than from `node`; the row is then
                // picked by position. Presumably all rows share one document, in which case
                // the five lookups could be evaluated once before the loop - confirm.
                Func<string, string> cellText = xpath => node.SelectNodes(xpath)[row].InnerText;

                Table.Data.Add(new RateData()
                {
                    // Removing every space already covers the former fixed-width space run
                    // and the leading/trailing trims. The second "\r\n" pass stays because
                    // removing the characters in between can bring a '\r' and '\n' together.
                    Currencty = cellText(@"//div[@class=""hidden-phone print_show""]")
                                .Replace("\r\n", "")
                                .Replace(" ", "")
                                .Replace("(", "")
                                .Replace(")", "")
                                .Replace("\r\n", ""),
                    CashBuying  = cellText(@"//td[@data-table=""本行現金買入""]"),
                    CashSelling = cellText(@"//td[@data-table=""本行現金賣出""]"),
                    SpotBuying  = cellText(@"//td[@data-table=""本行即期買入""]"),
                    SpotSelling = cellText(@"//td[@data-table=""本行即期賣出""]")
                });
            }
            return Table;
        }
        /// <summary>
        /// Lists the href of every stylesheet link element in the document.
        /// </summary>
        /// <returns>
        /// One entry per matching element (empty string when href is absent); an empty list
        /// when the document has no stylesheet links.
        /// </returns>
        public IEnumerable<string> GetStylesheets()
        {
            HtmlNodeCollection linkNodes = this.document.DocumentNode.SelectNodes("//link[@rel='stylesheet']");

            if (linkNodes == null)
            {
                return new List<string>();
            }

            return linkNodes.Select(link => link.GetAttributeValue("href", string.Empty));
        }
        /// <summary>
        /// Lists the src of every script element that declares one.
        /// </summary>
        /// <returns>
        /// One entry per matching element; an empty list when the document has no such scripts.
        /// </returns>
        public IEnumerable<string> GetScripts()
        {
            HtmlNodeCollection scriptNodes = this.document.DocumentNode.SelectNodes("//script[@src]");

            if (scriptNodes == null)
            {
                return new List<string>();
            }

            return scriptNodes.Select(script => script.GetAttributeValue("src", string.Empty));
        }
示例#13
0
        /// <summary>
        /// Crawls every result page of the current category:
        /// 1. determine the total number of pages,
        /// 2. fetch the data of each page,
        /// 3. (analysis / filtering / cleaning happens in the per-page helper),
        /// 4. persistence is not performed here - the collected list is currently discarded.
        /// </summary>
        private void GetPageCourseData()
        {
            // Note: this rewrites category.Url in place by prefixing the site host.
            category.Url = $"https://ke.qq.com{category.Url}";

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(HttpHelper.DownloadUrl(category.Url));

            // XPath of the numbered paging buttons; absent when there is a single page.
            HtmlNodeCollection pageButtons = document.DocumentNode.SelectNodes(
                "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']");

            int pageCount = pageButtons == null
                            ? 1
                            : pageButtons.Max(button => int.Parse(button.InnerText));

            var allCourses = new List<CourseEntity>();
            for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
            {
                Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************");
                allCourses.AddRange(GetPageIndeData($"{category.Url}&page={pageIndex}"));
            }
        }
示例#14
0
        /// <summary>
        /// Downloads the page named in the URL text box, extracts its title element(s),
        /// strips the site name and a set of punctuation characters, and shows the result in
        /// the title text box.
        /// </summary>
        private void GetTitle()
        {
            string pageSource = m_wd.GetPageByHttpWebRequest(this.textBoxUrl.Text, Encoding.UTF8);

            var parsedPage = new HtmlAgilityPack.HtmlDocument();
            parsedPage.OptionAddDebuggingAttributes = false;
            parsedPage.OptionAutoCloseOnEnd         = true;
            parsedPage.OptionFixNestedTags          = true;
            parsedPage.OptionReadEncoding           = true;
            parsedPage.LoadHtml(pageSource);

            // Several title elements are joined with ';'; none leaves the title empty.
            HtmlNodeCollection titleNodes = parsedPage.DocumentNode.SelectNodes("//title");
            string title = titleNodes == null
                           ? ""
                           : string.Join(";", titleNodes.Select(node => node.InnerText).ToArray()).Trim();

            title = title.Replace("博客园", "");
            title = Regex.Replace(title, @"[|/\;:*?<>&#-]", "");
            title = Regex.Replace(title, "[\"]", "");

            this.textBoxTitle.Text = title.TrimEnd();
        }
示例#15
0
        /// <summary>
        /// Handles a finished page download: turns the second-column cells of the index table
        /// into list items and raises <c>ListReceived</c>, or reports a parsing error in the
        /// status bar when the table is not found.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Carries the downloaded page.</param>
        private void Downloader_FinishDownload(object sender, DownloaderHtmlPageArgs e)
        {
            // Null when the expected table is missing from the page.
            HtmlNodeCollection cells = e.Page.DocumentNode.SelectNodes(@"//div[@id=""index""]//tr[position()>1]/td[2]");

            // Page links come without the site address, so the working URL minus "/soft"
            // is prepended to build a complete link.
            string baseUrl = UriWork.OriginalString.Replace(@"/soft", "");

            if (cells == null)
            {
                Program.statusBarGlobal.Message = "Ошибка на этапе парсинга страницы";
                return;
            }

            var postsList = new List<ItemList>();
            int position  = 0;
            foreach (HtmlNode cell in cells)
            {
                postsList.Add(new ItemList
                {
                    // HtmlDecode converts HTML escape sequences into plain characters.
                    Name   = HttpUtility.HtmlDecode(cell.LastChild.InnerText),
                    Href   = baseUrl + cell.LastChild.GetAttributeValue("href", null),
                    Index  = position,
                    Magnet = cell.ChildNodes[1].GetAttributeValue("href", null),
                });
                position++;
            }

            ItemListArgs eventArgs = new ItemListArgs(postsList);
            // Raised only when there are subscribers.
            ListReceived?.Invoke(this, eventArgs);
        }
示例#16
0
        /// <summary>
        /// Fetches a list of <see cref="NewsItem"/> from the given stream which should point to http://heise.de
        /// </summary>
        /// <param name="documentStream">Stream to fetch <see cref="NewsItem"/> from</param>
        /// <returns>
        /// List of <see cref="NewsItem"/>; empty when the document contains no article elements.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="documentStream"/> is null.</exception>
        public override IEnumerable<NewsItem> GetNewsItemsFromStream(Stream documentStream)
        {
            if (documentStream == null)
            {
                throw new ArgumentNullException(nameof(documentStream));
            }
            HtmlDocument doc = new HtmlDocument();

            // NOTE(review): Encoding.Default depends on the machine/runtime - confirm it matches the site.
            doc.Load(documentStream, Encoding.Default);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection allArticleContainers = doc.DocumentNode.SelectNodes("/descendant::article");
            if (allArticleContainers == null)
            {
                return Enumerable.Empty<NewsItem>();
            }

            // Materialised once so repeated enumeration does not create new items each time.
            return allArticleContainers
                   .Select(container => container.SelectSingleNode("a"))
                   // Some articles have no direct link child; skip them.
                   .Where(link => link != null)
                   .Select(link => new NewsItem(
                               link.GetAttributeValue("title", null),
                               link.GetAttributeValue("href", null),
                               // Some articles don't contain a paragraph; those get null.
                               link.SelectSingleNode("div/p")?.InnerHtml))
                   .ToList();
        }
示例#17
0
        /// <summary>
        /// Checks whether a list pattern can be applied to a page. This only rules out the
        /// obviously impossible cases. Copied, like the overload it delegates to, from ListStrategy.
        /// </summary>
        /// <param name="Url">Address of the page.</param>
        /// <param name="HTML">Markup of the page.</param>
        /// <param name="XPath">Pattern with the item-root and title XPath expressions.</param>
        /// <returns>
        /// <c>false</c> when the page cannot be parsed, no item roots or titles match, or the
        /// title score is too low; otherwise the result of the node-based overload.
        /// </returns>
        public bool ValidateListXPath(string Url, string HTML, XpathPattern XPath)
        {
            // Get the root node (some sites omit the html tag and start directly at head).
            HtmlNode rootNode = HtmlUtility.getSafeHtmlRootNode(HTML, true, true);

            if (rootNode == null)
            {
                return false;
            }

            HtmlNodeCollection rootNodes = rootNode.SelectNodes(XPath.ItemRootXPath);

            if (rootNodes == null)
            {
                return false;
            }

            // Materialise once: the lazy query would otherwise re-run the title XPath for
            // every item on each Count()/FirstOrDefault()/ToList() call.
            // An item without a title yields null, which is filtered out here.
            List<HtmlNode> matchedTitles = rootNodes
                                           .Select(f => f.SelectSingleNode(XPath.TitleXPath))
                                           .Where(f => f != null)
                                           .ToList();

            if (matchedTitles.Count == 0)
            {
                return false;
            }

            List<HtmlNode> TitleNodes = matchedTitles.Where(a => !string.IsNullOrEmpty(a.InnerText)).ToList();
            double         Score      = ScoreforListTitle(TitleNodes);

            return (Score > Threshold.LeastTitleScore || (Url.Contains("tieba.baidu.com") && Score > 100)) && ValidateListXPath(Url, rootNode, XPath);
        }
示例#18
0
        /// <summary>
        /// Flags a crawled page as built with WordPress when either the raw content matches
        /// the WordPress pattern or any link on the page points to wordpress.org.
        /// </summary>
        /// <param name="crawlContext">Crawl context (unused here).</param>
        /// <param name="crawledPage">Page whose raw content and links are inspected.</param>
        /// <returns>
        /// A result with attribute id 222; on a hit it carries siteBuilder = BlogWordPress
        /// and IsAHit = true.
        /// </returns>
        protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            var result = new ProcessorResult { UniqueAttributeId = 222 };

            bool isWordPress = wordPressPattern.Match(crawledPage.RawContent).Success;

            // The links are examined only when the content pattern did not already match.
            if (!isWordPress)
            {
                HtmlNodeCollection anchors = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
                isWordPress = anchors != null
                              && anchors.Any(anchor => anchor.GetAttributeValue("href", "").Contains("wordpress.org"));
            }

            if (isWordPress)
            {
                result.Attributes.Add("siteBuilder", "BlogWordPress");
                result.IsAHit = true;
            }

            return result;
        }
示例#19
0
        /// <summary>
        /// Loads the site index and returns the href of every node matched by <c>pageUrlPath</c>.
        /// </summary>
        /// <returns>
        /// The link targets; empty when nothing matches. Matched nodes without an href are skipped.
        /// </returns>
        private IEnumerable<string> GetPageUrls()
        {
            HtmlDocument       doc   = _webClient.Load(siteIndex);
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(pageUrlPath);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (nodes == null)
            {
                return Enumerable.Empty<string>();
            }

            return nodes
                   .Select(node => node.GetAttributeValue("href", null))
                   .Where(href => href != null);
        }
示例#20
0
        /// <summary>
        /// Crawls the course list of the current category (at most the first three pages) and
        /// exports the collected courses to an .xls file under the CrawlerFile folder next to
        /// the application. Failures are reported on the console and not rethrown.
        /// </summary>
        public void CrawlerAll()
        {
            // Upper bound on the number of result pages fetched per run.
            const int maxPages = 3;

            try
            {
                if (string.IsNullOrWhiteSpace(category.Url))
                {
                    Console.WriteLine($"分类的链接为空{category.Name}{category.CategoryLevel}");
                    return;
                }

                string html = HttpHelper.DownLoad(category.Url, Encoding.UTF8);

                // Parse the first page to discover the paging buttons.
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(html);

                string             pageXPath = "/html/body/section[1]/div/div[6]/a[@class='page-btn']";
                HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pageXPath);

                // No paging buttons means the category fits on a single page.
                int pageCount = pageNodes == null
                                ? 1
                                : pageNodes.Max(a => Convert.ToInt32(a.InnerText));
                int lastPage = Math.Min(pageCount, maxPages);

                List<CourseEntity> courses = new List<CourseEntity>();
                for (int i = 1; i <= lastPage; i++)
                {
                    string pageUrl = $"{category.Url}&page={i}";
                    html = HttpHelper.DownLoad(pageUrl, Encoding.UTF8);

                    HtmlDocument pageDocument = new HtmlDocument();
                    pageDocument.LoadHtml(html);
                    string             liXPath        = "/html/body/section[1]/div/div[4]/ul/li";
                    HtmlNodeCollection pageLiNodeList = pageDocument.DocumentNode.SelectNodes(liXPath);

                    // A page without course items is skipped instead of aborting the whole crawl.
                    if (pageLiNodeList == null)
                    {
                        continue;
                    }
                    foreach (var liNode in pageLiNodeList)
                    {
                        courses.Add(GetOneCourse(liNode));
                    }
                }

                string sheetName = "course";
                string direct    = Path.Combine(System.AppDomain.CurrentDomain.BaseDirectory, "CrawlerFile");
                string Name      = $"{sheetName}{DateTime.Now:yyyyMMddHHmmss}.xls";

                if (!Directory.Exists(direct))
                {
                    Directory.CreateDirectory(direct);
                }
                ExcelHelper eh        = new ExcelHelper(Path.Combine(direct, Name));
                DataTable   dtCourses = ListToDataTable.ToDataTable<CourseEntity>(courses);
                eh.DataTableToExcel(dtCourses, sheetName, true);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Crawler抓取异常");
                // Keep the failure diagnosable instead of dropping the details.
                Console.WriteLine(ex);
            }
        }
示例#21
0
 /// <summary>
 /// Concatenates the inner text of all nodes in the collection and trims the result.
 /// </summary>
 /// <param name="htmlNodeCollection">Nodes to read; may be null.</param>
 /// <returns>The combined text, or <c>null</c> when the collection is null.</returns>
 public static string TextAll(this HtmlNodeCollection htmlNodeCollection)
 {
     if (htmlNodeCollection is null)
     {
         return null;
     }

     // Join the nodes' text; joining the node objects themselves would go through
     // ToString() rather than their content.
     return string.Join("", htmlNodeCollection.Select(e => e.InnerText)).Trim();
 }
        /// <summary>
        /// Downloads a search page and stores its result links in <c>links</c>. When the page
        /// lacks the navigation menu or the result table, the download is retried after a
        /// pause until it succeeds or the page is reported as already parsed.
        /// </summary>
        /// <param name="page">Search page to process; its CURRENT_PAGE is appended to URL.</param>
        /// <exception cref="Exception">The server returned an empty body.</exception>
        protected void parsePage(SearchPage page)
        {
            // Retries run as a loop rather than recursion, so a long outage
            // (e.g. a captcha wall) cannot exhaust the stack.
            while (!isHaveParsed(page))
            {
                string body;

                using (HttpClient client = getNewHttpClient())
                {
                    // Blocking wait: this method is synchronous by contract.
                    body = client.GetStringAsync(URL + page.CURRENT_PAGE).Result;
                }

                if (string.IsNullOrWhiteSpace(body))
                {
                    throw new Exception("Empty body");
                }

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(body);

                if (!checkNavigationMenu(document))
                {
                    // No navigation menu: pause, then try again.
                    Tools._pause(5000, 10000);
                    continue;
                }

                // Checked again after the download, as before.
                if (isHaveParsed(page))
                {
                    return;
                }

                updateLastPagePosition(document);

                HtmlNodeCollection nodes = document.DocumentNode.SelectNodes(Node.TABLE_SEARCH_RESULT);

                if (nodes == null)
                {
                    // No data in the body (possibly a captcha page): pause, then try again.
                    Tools._pause(5000, 10000);
                    continue;
                }

                links = nodes
                        .Select(a => a.GetAttributeValue("href", String.Empty))
                        .Where(link => !string.IsNullOrWhiteSpace(link) && link != "javascript:void(0)")
                        .ToList();
                return;
            }
        }
示例#23
0
        /// <summary>
        /// Searches Google for a term, finds the 1-based positions of the results whose markup
        /// mentions the given site, stores the search with its result, and returns the positions.
        /// </summary>
        /// <param name="searchTerm">Text to search for.</param>
        /// <param name="url">
        /// Site to look for. When it contains "www", only the first label after "www." is matched.
        /// </param>
        /// <returns>
        /// Comma-separated positions (empty when the site is not among the results), or an
        /// empty string - with nothing stored - when no result nodes are found.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the body contains no await, so the page load and the save run
        /// synchronously on the caller's thread.
        /// </remarks>
        public async Task<string> GetGoogleResultsAsync(string searchTerm, string url)
        {
            Search search = new Search
            {
                Query = searchTerm,
                Date  = DateTime.Now
            };

            Result result = new Result()
            {
                UrlAnalyzed = url,
            };

            int maxResultPosition = 100;

            // Percent-encode the term so characters such as '&', '#' or '+' cannot alter the
            // query string; spaces keep the conventional '+' form.
            string encodedTerm = Uri.EscapeDataString(searchTerm).Replace("%20", "+");

            if (url.Contains("www"))
            {
                url = url.Replace("www.", "").Split('.')[0];
            }

            string searchUrl = $"http://www.google.co.uk/search?num={maxResultPosition}&q={encodedTerm}";

            HtmlDocument response = new HtmlWeb().Load(searchUrl);

            // Null when the result markup is not present.
            HtmlNodeCollection nodes = response.DocumentNode.SelectNodes("//div[contains(@class, 'BNeawe UPmit AP7Wnd')]");

            if (nodes == null)
            {
                return "";
            }

            IEnumerable<int> indexes = nodes.Select((x, i) => new { i, x.InnerHtml })
                                       .Where(x => x.InnerHtml.Contains(url))
                                       .Select(x => x.i + 1);

            string stringResult = string.Join(",", indexes);

            result.Position = stringResult;
            search.Result   = result;

            _context.Searches.Add(search);
            _context.SaveChanges();

            return stringResult;
        }
示例#24
0
        /// <summary>
        /// Loads a board listing page and returns the href of every article title link.
        /// </summary>
        /// <param name="pageUrl">Address of the listing page.</param>
        /// <returns>
        /// The link targets in page order; empty when the page has no matching entries.
        /// Title links without an href are skipped.
        /// </returns>
        public static List<string> GetListUrls(string pageUrl)
        {
            var          web = new HtmlWeb();
            HtmlDocument doc = web.Load(pageUrl);

            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//div[@class='r-list-container action-bar-margin bbs-screen']/div[@class='r-ent']/div[@class='title']/a");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (nodes == null)
            {
                return new List<string>();
            }

            return nodes
                   .Select(n => n.GetAttributeValue("href", null))
                   .Where(href => href != null)
                   .ToList();
        }
示例#25
0
        /// <summary>
        /// Returns the text of every paragraph inside the main content of an article.
        /// </summary>
        /// <param name="parentNode">Node passed to <c>GetMainContent</c> to locate the article body.</param>
        /// <returns>
        /// The paragraph texts in document order, or <c>null</c> when there is no main
        /// content or it contains no paragraphs.
        /// </returns>
        public static List<string> GetArticleText(HtmlNode parentNode)
        {
            HtmlNode mainContent = GetMainContent(parentNode);

            if (mainContent == null)
            {
                return null;
            }

            // ".//p" stays inside mainContent; a bare "//p" is resolved from the document
            // root and would return every paragraph on the page.
            HtmlNodeCollection paragraphs = mainContent.SelectNodes(".//p");

            if (paragraphs == null)
            {
                return null;
            }
            return paragraphs.Select(n => n.InnerText).ToList();
        }
        /// <summary>
        /// Downloads the news page and parses every "post text" block into a news entry.
        /// </summary>
        /// <returns>The parsed entries; empty when the page contains no such blocks.</returns>
        public async Task<IEnumerable<News>> GetNewsAsync()
        {
            string html = await GetHtmlAsync(NewsUrl);

            var document = new HtmlDocument();

            document.LoadHtml(html);

            // The expression is document-absolute ("//"), so it is evaluated from the document
            // node directly; this also covers markup that has no body element.
            HtmlNodeCollection entries = document.DocumentNode.SelectNodes("//div[@class='post text']");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (entries == null)
            {
                return Enumerable.Empty<News>();
            }

            // Parsed eagerly so failures surface from this call rather than at enumeration.
            return entries.Select(ParseNode).ToList();
        }
        /// <summary>
        /// Maps city link nodes to city records (display text plus site URL built from the href).
        /// </summary>
        /// <param name="cityHtml">City nodes; may be null.</param>
        /// <returns>
        /// A lazily evaluated sequence of records, or <c>null</c> when <paramref name="cityHtml"/> is null.
        /// </returns>
        private IEnumerable<CityData> CreateCityData(HtmlNodeCollection cityHtml)
        {
            if (cityHtml == null)
            {
                return null;
            }

            return cityHtml.Select(cityNode => new CityData()
            {
                Name = _htmlParser.ParseText(cityNode),
                Url  = _pageHandler.CreateLunUaUrl(_htmlParser.ParseHref(cityNode))
            });
        }
示例#28
0
        /// <summary>
        /// Maps city link nodes to apartment-complex group records (city name plus site URL
        /// built from the href).
        /// </summary>
        /// <param name="cityHtml">City nodes; may be null.</param>
        /// <returns>
        /// A lazily evaluated sequence of records, or <c>null</c> when <paramref name="cityHtml"/> is null.
        /// </returns>
        private IEnumerable<ApartComplexesGroupData> CreateApartComplexData(HtmlNodeCollection cityHtml)
        {
            if (cityHtml == null)
            {
                return null;
            }

            return from cityNode in cityHtml
                   select new ApartComplexesGroupData()
                   {
                       CityName = _htmlParser.ParseText(cityNode),
                       Url      = _pageHandler.CreateDomRiaUrl(_htmlParser.ParseHref(cityNode))
                   };
        }
示例#29
0
        /// <summary>
        /// Reads the odd value from the first node matched by <c>OddXPaths.VALUE</c>.
        /// </summary>
        /// <param name="oddNode">Node that contains the odd.</param>
        /// <returns>
        /// The parsed value, or 0 when no node matches or the text is not a valid decimal.
        /// Parsing uses the current culture.
        /// </returns>
        protected virtual decimal GetValue(HtmlNode oddNode)
        {
            HtmlNodeCollection valueNodes = oddNode.SelectNodes(OddXPaths.VALUE);

            if (valueNodes == null)
            {
                return 0m;
            }

            decimal parsed;
            return decimal.TryParse(valueNodes.First().InnerText, out parsed) ? parsed : 0m;
        }
示例#30
0
        /// <summary>
        /// Returns the value of the named attribute on the first node of the collection
        /// without throwing when the collection or the attribute is missing.
        /// </summary>
        /// <param name="collection">Nodes to inspect (typically a SelectNodes result); may be null.</param>
        /// <param name="name">Attribute name, compared exactly.</param>
        /// <returns>
        /// The attribute value, or an empty string when the collection is null or empty or
        /// its first node lacks the attribute.
        /// </returns>
        private string SafeGetAttributeValue(HtmlNodeCollection collection, string name)
        {
            // SelectNodes hands back null when nothing matches; a "safe" getter must accept that.
            if (collection == null)
            {
                return string.Empty;
            }

            // Only the first node is consulted, as before.
            HtmlNode first = collection.FirstOrDefault();
            if (first == null)
            {
                return string.Empty;
            }

            var attr = first.Attributes.FirstOrDefault(a => a.Name == name);
            return attr != null ? attr.Value : string.Empty;
        }