示例#1
0
        /// <summary>
        /// Collects the thumbnail container nodes of a gallery, following the pager's
        /// last-cell link from page to page until the final page is reached.
        /// </summary>
        /// <param name="rlist">Receives the <c>div</c> children of the element with id <c>gdt</c> from every visited page.</param>
        /// <param name="url">Address of the first page to download; nothing is fetched when it is null or empty.</param>
        private void GetImageList(List<HtmlNode> rlist, string url)
        {
            // Iterate instead of recursing so that a long pager cannot exhaust the call stack.
            string nextUrl = url;

            while (!string.IsNullOrEmpty(nextUrl))
            {
                string html = helper.GetHTML(nextUrl);

                HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
                document.LoadHtml(html);

                HtmlNode rootNode = document.DocumentNode;

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection imglist = rootNode.SelectNodes("//*[@id=\"gdt\"]/div");
                if (imglist != null)
                {
                    rlist.AddRange(imglist);
                }

                HtmlNodeCollection pageList = rootNode.SelectNodes("//*[@class=\"ptb\"]/*/td");
                if (pageList == null || pageList.Count == 0)
                {
                    // No pager on this page: there is nothing further to follow.
                    return;
                }

                HtmlNode last = pageList[pageList.Count - 1];

                // A last pager cell that is a bare ">" (no link inside) marks the final page.
                if (last.InnerHtml == "&gt;")
                {
                    return;
                }

                // The first child of the last cell carries the link to the next page.
                HtmlNode link = last.FirstChild;
                nextUrl = link != null ? link.GetAttributeValue("href", "") : "";
            }
        }
示例#2
0
文件: Form1.cs 项目: wyhb/HapSample
        /// <summary>
        /// Reports whether any node in <paramref name="list"/> has inner markup that contains a
        /// script-like element, written either literally or entity-encoded.
        /// </summary>
        /// <param name="list">Nodes whose <c>InnerHtml</c> is inspected; <c>null</c> is treated as "no nodes".</param>
        /// <returns><c>true</c> as soon as one node contains an opening tag that precedes its closing tag.</returns>
        private bool checkJs(HtmlNodeCollection list)
        {
            if (list == null)
            {
                return false;
            }

            foreach (var x in list)
            {
                // Read InnerHtml once per node rather than once for every probe.
                string html = x.InnerHtml;

                if (HasTagPair(html, "<script>", "</script>") ||
                    HasTagPair(html, "&lt;script&gt;", "&lt;/script&gt;") ||
                    HasTagPair(html, "<javascript>", "</javascript>") ||
                    HasTagPair(html, "&lt;javascript&gt;", "&lt;/javascript&gt;"))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Tells whether <paramref name="html"/> contains <paramref name="open"/> and the first
        /// occurrence of it lies before the last occurrence of <paramref name="close"/>.
        /// </summary>
        private static bool HasTagPair(string html, string open, string close)
        {
            // Ordinal comparison: these are markup tokens, not linguistic text.
            int start = html.IndexOf(open, System.StringComparison.Ordinal);

            // When the closing token is absent LastIndexOf gives -1, which can never exceed start.
            return start >= 0 && start < html.LastIndexOf(close, System.StringComparison.Ordinal);
        }
示例#3
0
        /// <summary>
        /// Downloads a chapter page and returns its text, with every content node followed by a blank line.
        /// </summary>
        /// <param name="chp">Chapter whose <c>chapterLink</c> is downloaded.</param>
        /// <param name="use">Document instance that is (re)loaded with the downloaded markup.</param>
        /// <param name="wc">Client used to download the page.</param>
        /// <returns>The decoded chapter text, or an empty string when no content nodes can be found.</returns>
        public override string GetText(Chapter chp, HtmlDocument use, WebClient wc)
        {
            // Turn explicit line breaks into newlines before parsing so they survive in InnerText.
            use.LoadHtml(Regex.Replace(wc.DownloadString(chp.chapterLink), "(<br>|<br/>)", "\n", RegexOptions.Singleline));

            // Preferred source: every element marked dir="ltr".
            HtmlNodeCollection paragraphs = use.DocumentNode.SelectNodes("//*[@dir=\"ltr\"]");

            if (paragraphs == null)
            {
                // Fallback: the <p> elements inside the chapter container.
                HtmlNode container = use.DocumentNode.SelectSingleNode("//*[@id=\"chapter-content\"]");
                if (container == null)
                {
                    return string.Empty;
                }

                // The document is reloaded with just the container so that "//p" is scoped to it.
                use.LoadHtml(container.OuterHtml);
                paragraphs = use.DocumentNode.SelectNodes("//p");
                if (paragraphs == null)
                {
                    return string.Empty;
                }
            }

            StringBuilder text = new StringBuilder();

            foreach (HtmlNode paragraph in paragraphs)
            {
                text.Append(HttpUtility.HtmlDecode(Regex.Unescape(paragraph.InnerText) + "\n\n"));
            }

            return text.ToString();
        }
示例#4
0
        /// <summary>
        /// Checks that the text-like form fields of the document declare a <c>placeholder</c> attribute.
        /// Inputs of type radio, checkbox, reset and submit are ignored; textareas are included.
        /// </summary>
        /// <returns>
        /// The problems found (empty when there are none). An exception raised while inspecting
        /// the document is reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List<string> CheckPlaceholders()
        {
            var errors = new List<string>();

            try
            {
                HtmlNodeCollection inputNodes = this.HtmlDoc.DocumentNode.SelectNodes("//input");
                if (inputNodes == null)
                {
                    // Without any <input> the textareas are not examined either.
                    errors.Add("Unable to find any placeholder.");
                    return errors;
                }

                var skippedTypes = new[] { "radio", "checkbox", "reset", "submit" };
                List<HtmlNode> fields = inputNodes
                                        .Where(input => !skippedTypes.Contains(input.GetAttributeValue("type", "")))
                                        .ToList();

                HtmlNodeCollection textareas = this.HtmlDoc.DocumentNode.SelectNodes("//textarea");
                if (textareas != null)
                {
                    fields.AddRange(textareas);
                }

                bool someFieldLacksPlaceholder = fields.Any(field => !field.Attributes.Any(attribute => attribute.Name == "placeholder"));
                if (someFieldLacksPlaceholder)
                {
                    errors.Add("Some fields does not have any defined placeholder.");
                }
            }
            catch (Exception e)
            {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return errors;
        }
示例#5
0
        /// <summary>
        /// Validates the <c>select</c> fields of the document: at least one must exist (its labels are
        /// checked through <c>CheckLabels</c>), there must be at least three options in total, and
        /// exactly one option must carry the <c>selected</c> attribute.
        /// </summary>
        /// <returns>
        /// The problems found (empty when there are none). An exception raised while inspecting
        /// the document is reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List<string> CheckSelectFields()
        {
            var errors = new List<string>();

            try
            {
                HtmlNodeCollection selects = this.HtmlDoc.DocumentNode.SelectNodes("//select");
                bool hasSelect = selects != null && selects.Count >= 1;
                if (hasSelect)
                {
                    errors.AddRange(CheckLabels(selects.ToList(), "select"));
                }
                else
                {
                    errors.Add("Does not contains enough select fields.");
                }

                HtmlNodeCollection options = this.HtmlDoc.DocumentNode.SelectNodes("//select/option");
                bool hasEnoughOptions = options != null && options.Count >= 3;
                if (!hasEnoughOptions)
                {
                    errors.Add("The select field does not contains enough options.");
                }
                else
                {
                    // Count the options flagged as default, whatever the attribute's value is.
                    int defaultOptions = 0;
                    foreach (HtmlNode option in options)
                    {
                        if (option.Attributes.Any(attribute => attribute.Name == "selected"))
                        {
                            defaultOptions++;
                        }
                    }

                    if (defaultOptions != 1)
                    {
                        errors.Add("The select field does not have a single default option.");
                    }
                }
            }
            catch (Exception e)
            {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return errors;
        }
        // Builds a tekstowo.pl address for the artist's song list, downloads it and scans the
        // pagination items of the result for links to further pages.
        // Returns null when no address can be built or the download fails.
        // NOTE(review): the remainder of this method is not visible in this excerpt.
        public static async Task <List <FindedSongObject> > SearchArtistSongs(FindedSongObject songToFind, Boolean useLinkToSongs = false)
        {
            IPageService            _pageService = new PageService();
            List <FindedSongObject> songs        = new List <FindedSongObject>();
            string html = null;
            // NOTE(review): Artist[0] is indexed before any null/empty check on Artist.
            string artistWithFirstCharUpper = char.ToUpper(songToFind.Artist[0]) + songToFind.Artist.Substring(1);
            string url = "";

            try
            {
                // Prefer the stored link to the artist's songs when the caller asks for it.
                if (useLinkToSongs && !String.IsNullOrEmpty(songToFind.LinkToArtistSongs))
                {
                    url = $"https://www.tekstowo.pl/{songToFind.LinkToArtistSongs}";
                }
                else if (!String.IsNullOrEmpty(songToFind.WorkingArtist))
                {
                    url = $"https://www.tekstowo.pl/piosenki_artysty,{songToFind.WorkingArtist}.html";
                }
                else
                {
                    return(null);
                }
                HttpClient httpClient = new HttpClient();
                html = await httpClient.GetStringAsync(url);
            }
            catch (HttpRequestException)
            {
                await _pageService.DisplayAlert(AppResources.AlertDialog_SearchProblem, $"{AppResources.AlertDialog_CouldNotFindArtistsSongs} {artistWithFirstCharUpper}", AppResources.AlertDialog_OK);

                return(null);
            }
            // NOTE(review): this is a finally block, so it also runs on the early-return and
            // failure paths above, where html is still null when it reaches LoadHtml — confirm
            // this is intended.
            finally
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);

                // Check whether the results span more than one page.

                HtmlNodeCollection queryPages = htmlDocument.DocumentNode.SelectNodes("//li[@class='page-item']");
                if (queryPages != null)
                {
                    var           listOfPages   = queryPages.ToList();
                    List <string> pagesHTMLLink = new List <string>();
                    if (!String.IsNullOrEmpty(url))
                    {
                        pagesHTMLLink.Add(url);
                    }

                    foreach (var div in listOfPages)
                    {
                        // The link text is cut out of the raw markup by position: it starts
                        // 7 characters after "href" and ends 2 characters before "title=".
                        // NOTE(review): IndexOf yields -1 when the text is absent, so the
                        // "not found" values here are 6 and -3; the != 0 test below does not
                        // detect them.
                        int indexStartLink = div.InnerHtml.IndexOf("href") + 7;
                        int indexEndLink   = div.InnerHtml.IndexOf("title=") - 2;
                        int checkIndexOfNextPageButtonElement = div.InnerHtml.IndexOf("tabindex=");
                        // Items whose markup contains "tabindex=" are skipped.
                        if (checkIndexOfNextPageButtonElement != -1)
                        {
                            continue;
                        }
                        if (indexStartLink != 0 && indexEndLink != 0)
                        {
                            string link = div.InnerHtml[indexStartLink..indexEndLink];
        // Prints the "DZIAŁ 2" section: for every <td class="csNDBDane"> cell of the given markup,
        // the first line of its inner HTML (tags stripped) is written in yellow, followed by a
        // gray separator line. Any failure while processing is reported as "ERROR".
        static void dzial2(string result)
        {
            Console.WriteLine("DZIAŁ 2");

            var doc = new HtmlDocument();
            doc.LoadHtml(result);

            try
            {
                string[] lineSeparators = { Environment.NewLine };

                // A null result (no matching cell) fails here and is handled by the catch below.
                foreach (HtmlNode cell in doc.DocumentNode.SelectNodes("//td[@class='csNDBDane']"))
                {
                    cell.Attributes.RemoveAll();
                    string firstLine = cell.InnerHtml.Split(lineSeparators, StringSplitOptions.None)[0];

                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine(Regex.Replace(firstLine, "<.*?>", String.Empty));
                    Console.ForegroundColor = ConsoleColor.Gray;
                    Console.WriteLine("-------------");
                }
            }
            catch (Exception)
            {
                Console.WriteLine("ERROR");
            }
        }
示例#8
0
        /// <inheritdoc/>
        public override async Task<List<HtmlNode>> ExtractSearchEntryElementsAsync()
        {
            using (HttpResponseMessage response = await Client.GetAsync(GetSearchUrl()).ConfigureAwait(false))
            using (HttpContent body = response.Content)
            {
                string markup = await body.ReadAsStringAsync().ConfigureAwait(false);

                var document = new HtmlDocument();
                document.LoadHtml(markup);

                // Result entries: elements of class 'g' under the first #search element that contain a <cite>.
                HtmlNodeCollection entries = document.DocumentNode.SelectNodes("//*[@id='search'][1]//*[@class='g']//descendant::cite[1]//ancestor::*[@class='g'][1]");

                // SelectNodes yields null when nothing matches; callers always get a list.
                return entries == null ? new List<HtmlNode>() : entries.ToList();
            }
        }
示例#9
0
        /// <summary>
        /// Loads the Weibo search page for a topic and extracts one item per post found on it.
        /// </summary>
        /// <param name="topicName">Topic text appended to the search address.</param>
        /// <param name="targetName">Only posts whose author contains this text are returned; empty or null keeps every post.</param>
        /// <returns>The matching posts ordered from newest to oldest; empty when the page contains no posts.</returns>
        public static List<WeiBoContentItem> GetWeiBoTopicContentV2(string topicName, string targetName = "")
        {
            List<WeiBoContentItem> res = new List<WeiBoContentItem>();
            HtmlWeb      webClient = new HtmlWeb();
            HtmlDocument doc       = webClient.Load("https://s.weibo.com/weibo/" + topicName + "&Refer=weibo_weibo&xsort=time&realtimeweibo=1");

            doc.DocumentNode.InnerHtml = JavaScriptAnalyzer.Decode(doc.DocumentNode.InnerHtml);
            HtmlNodeCollection contentList = doc.DocumentNode.SelectNodes("//div[@class='content clearfix']");

            // SelectNodes yields null when nothing matches.
            if (contentList == null)
            {
                return res;
            }

            // One topic entry per content block.
            foreach (HtmlNode p in contentList)
            {
                var item = new WeiBoContentItem();

                // SelectSingleNode returns null for a missing element, which the ?. chains below
                // tolerate; Convert.ToDateTime maps a null string to DateTime.MinValue.
                item.Time       = Convert.ToDateTime(p.SelectSingleNode(".//a[@class='W_textb']")?.InnerText);
                item.Author     = p.SelectSingleNode(".//a[@class='W_texta W_fb']")?.InnerText.Trim();
                item.ContentStr = p.SelectSingleNode(".//p[@class='comment_txt']")?.InnerText.Trim();

                HtmlNode pic = p.SelectSingleNode(".//img[@action-type='feed_list_media_img']");
                item.Pic = "https:" + pic?.Attributes.FirstOrDefault(c => c.Name == "src")?.Value.Replace("thumbnail", "large");

                res.Add(item);
            }

            // Author is null for posts without a nickname element; treat that as an empty name.
            string wanted = targetName ?? "";
            return res.Where(p => (p.Author ?? "").Contains(wanted)).OrderByDescending(p => p.Time).ToList();
        }
示例#10
0
        /// <summary>
        /// Rewrites every <c>math</c> element of the document: its attributes are replaced by a single
        /// <c>xmlns:mml</c> declaration, it and all of its descendants get an <c>mml:</c> name prefix,
        /// and the element is then swapped for a comment node holding its markup wrapped in
        /// <c>MathMlOutline</c>.
        /// </summary>
        /// <param name="htmlDoc">Document that is modified in place.</param>
        private void ConvertMathMl(HtmlDocument htmlDoc)
        {
            HtmlNodeCollection found = htmlDoc.DocumentNode.SelectNodes("//math");
            if (found == null)
            {
                return;
            }

            // Work on a snapshot because the document is edited while the elements are processed.
            var mathElements = found.ToList();

            for (int i = 0; i < mathElements.Count; i++)
            {
                HtmlNode math = mathElements[i];

                math.Attributes.RemoveAll();
                math.Attributes.Add(htmlDoc.CreateAttribute("xmlns:mml", MathMlNameSpace));

                foreach (var element in math.DescendantsAndSelf())
                {
                    element.Name = "mml:" + element.Name;
                }

                HtmlCommentNode replacement = htmlDoc.CreateComment(String.Format(MathMlOutline, math.OuterHtml));
                math.ParentNode.ReplaceChild(replacement, math);
            }
        }
示例#11
0
        /// <summary>
        /// Scrapes every paged URL of a dealer and turns each result row into a <c>CarInfo</c>.
        /// </summary>
        /// <param name="pagingInfo">Supplies the URLs to load through <c>PagedUrls</c>.</param>
        /// <param name="dealer">Dealer whose <c>Url</c> and <c>Name</c> are stamped on every car.</param>
        /// <param name="selector">Provides the row selectors, the row parser and the clean-up and regex maps.</param>
        /// <returns>All cars parsed from all pages, in page order.</returns>
        private List<CarInfo> ScrapMultiple(PagingInfo pagingInfo, DealerInfo dealer, ISelector selector)
        {
            var cars = new List<CarInfo>();

            foreach (var pagedUrl in pagingInfo.PagedUrls)
            {
#if DEBUG
                var started = DateTime.Now;
#endif
                HtmlDocument doc = LoadWebsite(pagedUrl);

                // Use the first row selector that yields any nodes.
                HtmlNodeCollection rows = null;
                foreach (var rowSelector in selector.GetRowSelectors())
                {
                    rows = doc?.DocumentNode.SelectNodes(rowSelector);
                    if (rows != null)
                    {
                        break;
                    }
                }

                if (rows != null)
                {
                    foreach (var row in rows.ToList())
                    {
                        var carInfo = selector.ParseHtmlIntoCarInfo(row, dealer);
                        carInfo.WebSite    = dealer.Url;
                        carInfo.DealerName = dealer.Name;

                        // Clean-up map: (property name, text to strip from that property's value).
                        var cleanupMap = selector.GetCleanupMap();
                        if (cleanupMap != null)
                        {
                            foreach (var entry in cleanupMap)
                            {
                                var property = carInfo.GetType().GetProperty(entry.Item1);
                                property.SetValue(carInfo, property.GetValue(carInfo)?.ToString().Replace(entry.Item2, "").Trim());
                            }
                        }

                        // Regex map: (property name, regex whose matches are replaced by one space).
                        var regexMap = selector.GetRegexMap();
                        if (regexMap != null)
                        {
                            foreach (var entry in regexMap)
                            {
                                var property = carInfo.GetType().GetProperty(entry.Item1);
                                var current  = property.GetValue(carInfo);
                                if (current != null)
                                {
                                    property.SetValue(carInfo, entry.Item2.Replace(current.ToString(), " ").Trim());
                                }
                            }
                        }

                        cars.Add(carInfo);
                    }
                }

#if DEBUG
                NLogger.Instance.Info(string.Format("Finished scrape for URL {0}, {1} cars. ({2} ms)", pagedUrl, cars.GroupBy(a => a.VIN).Select(a => a.First()).Count(), (DateTime.Now - started).TotalMilliseconds));
#endif
            }

            return cars;
        }
示例#12
0
        /// <summary>
        /// Fills <c>Biography</c> and <c>ScientificInterests</c> from the first section whose
        /// lower-cased label matches the corresponding keyword.
        /// </summary>
        /// <param name="sections">Sections to inspect; the collection itself is not modified.</param>
        public void SetInfoFromSections(HtmlNodeCollection sections)
        {
            // Work on a copy so a consumed section can be dropped without touching the caller's collection.
            var remaining = sections.ToList();

            // biography
            var biographyWords = new string[] { "біографія" };
            int biographyIndex = remaining.FindIndex(section => Contains(LecturerPageCriteria.GetLowerLabelFromSection(section), biographyWords));
            if (biographyIndex >= 0)
            {
                Biography = LecturerPageCriteria.GetInnerTextFromDivInSection(remaining[biographyIndex]);
                remaining.RemoveAt(biographyIndex);
            }

            // scientific interests
            var interestWords = new string[] { "наукові інтереси" };
            int interestsIndex = remaining.FindIndex(section => Contains(LecturerPageCriteria.GetLowerLabelFromSection(section), interestWords));
            if (interestsIndex >= 0)
            {
                ScientificInterests = LecturerPageCriteria.GetInnerTextFromDivInSection(remaining[interestsIndex]);
                remaining.RemoveAt(interestsIndex);
            }
        }
示例#13
0
        //Adapted from https://stackoverflow.com/questions/43364856/get-web-page-using-htmlagilitypack-netcore
        /// <summary>
        /// Downloads a match-day results page and returns its result rows.
        /// </summary>
        /// <param name="matchDaysResultsURL">Address of the results page.</param>
        /// <returns>
        /// The <c>tr</c> rows that have an <c>onclick</c> attribute; when there are none, the rows
        /// whose class is exactly <c>nolink</c>. Empty when the page title contains "404" or
        /// no such rows exist.
        /// </returns>
        private async Task<List<HtmlNode>> GetResultNodes(string matchDaysResultsURL)
        {
            // The client is created per call, so it is disposed here together with the response.
            using (HttpClient client = new HttpClient())
            using (HttpResponseMessage response = await client.GetAsync(matchDaysResultsURL))
            using (HttpContent content = response.Content)
            {
                // Decode manually: the page is read with the encoding named by ENCODING.
                byte[] contentBytes  = await content.ReadAsByteArrayAsync();
                string contentString = Encoding.GetEncoding(ENCODING).GetString(contentBytes);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(contentString);

                // A page without a <title> is not treated as an error page.
                HtmlNode title = doc.DocumentNode.SelectSingleNode("//title");
                if (title != null && title.InnerText.Contains("404"))
                {
                    return new List<HtmlNode>();
                }

                HtmlNodeCollection clickableRows = doc.DocumentNode.SelectNodes("//tr[@onclick]");
                if (clickableRows != null)
                {
                    return clickableRows.ToList();
                }

                // SelectNodes yields null when nothing matches.
                HtmlNodeCollection classedRows = doc.DocumentNode.SelectNodes("//tr[@class]");
                if (classedRows == null)
                {
                    return new List<HtmlNode>();
                }

                return classedRows.Where(n => n.Attributes["class"].Value.Equals("nolink")).ToList();
            }
        }
示例#14
0
 /// <summary>
 /// Copies a node collection into a list, mapping a null collection to an empty list.
 /// </summary>
 private static List<HtmlNode> toList(HtmlNodeCollection coll)
 {
     return coll == null ? new List<HtmlNode>() : coll.ToList();
 }
示例#15
0
        /// <summary>
        /// Builds a <c>Skater</c> for every entry of <c>_names</c> whose player stats contain
        /// exactly as many values as there are header fields.
        /// </summary>
        private List<Skater> GetSkaters()
        {
            var skaters = new List<Skater>();

            foreach (var name in _names.ToList())
            {
                var stats = GetPlayerStats(name);

                // Entries whose stats do not line up with the header are skipped.
                if (stats.Count == GetHeaderFields().Count)
                {
                    skaters.Add(new Skater(stats));
                }
            }

            return skaters;
        }
        // Returns a list of HtmlNodes representing every <a> tag with an href in the html page source.
        // The list is empty when the page contains no such tag.
        public static List<HtmlNode> GetAllLinks(string pageSource)
        {
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(pageSource);
            HtmlNodeCollection allLinks = doc.DocumentNode.SelectNodes("//a[@href]");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            return allLinks != null ? allLinks.ToList() : new List<HtmlNode>();
        }
示例#17
0
        /// <summary>
        /// Parses <paramref name="html"/> and returns every node matched by the XPath <c>//*</c>.
        /// </summary>
        /// <param name="html">Markup to parse.</param>
        /// <returns>The matched nodes; an empty list when the markup yields none.</returns>
        private List<HtmlNode> GetListOfNodes(String html)
        {
            HtmlDocument htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(html);
            HtmlNodeCollection htmlNodes = htmlDocument.DocumentNode.SelectNodes("//*");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            return htmlNodes != null ? htmlNodes.ToList() : new List<HtmlNode>();
        }
示例#18
0
        /// <summary>
        /// Evaluates <paramref name="xpath"/> against the document node obtained from <paramref name="html"/>.
        /// </summary>
        /// <returns>A copy of the matched nodes, or an empty array when nothing matches.</returns>
        public static IEnumerable<HtmlNode> Get(string html, string xpath)
        {
            HtmlNodeCollection matches = GetDocumentNode(html).SelectNodes(xpath);

            if (matches == null)
            {
                return new HtmlNode[0];
            }

            return matches.ToList();
        }
示例#19
0
        /// <summary>
        /// Fills <c>Position</c>, <c>AcademicRank</c> and <c>Contacts</c> from labelled info rows.
        /// Position and rank come from the first row whose lower-cased label matches; contacts
        /// are gathered from every matching row.
        /// </summary>
        /// <param name="infoRows">Rows to inspect; the collection itself is not modified.</param>
        public void SetInfoFromRows(HtmlNodeCollection infoRows)
        {
            // Work on a copy so consumed rows can be dropped without touching the caller's collection.
            var infoRowsClone = infoRows.ToList();

            // position
            var positionWords = new string[] { "посада" };
            int positionIndex = infoRowsClone.FindIndex(row => Contains(LecturerPageCriteria.GetLowerLabelFromRow(row), positionWords));
            if (positionIndex >= 0)
            {
                this.Position = LecturerPageCriteria.GetValueInnerTextFromRow(infoRowsClone[positionIndex]);
                infoRowsClone.RemoveAt(positionIndex);
            }

            // academic rank
            var rankWords = new string[] { "звання" };
            int rankIndex = infoRowsClone.FindIndex(row => Contains(LecturerPageCriteria.GetLowerLabelFromRow(row), rankWords));
            if (rankIndex >= 0)
            {
                this.AcademicRank = LecturerPageCriteria.GetValueInnerTextFromRow(infoRowsClone[rankIndex]);
                infoRowsClone.RemoveAt(rankIndex);
            }

            // contacts: every matching row contributes, each prefixed by a single space
            var contactWords = new string[] { "профіль", "пошта" };
            this.Contacts = "";
            int i = 0;
            while (i < infoRowsClone.Count)
            {
                var label = LecturerPageCriteria.GetLowerLabelFromRow(infoRowsClone[i]);

                if (!Contains(label, contactWords))
                {
                    i++;
                    continue;
                }

                var contact = LecturerPageCriteria.GetValueLinkFromRow(infoRowsClone[i]);
                this.Contacts = string.Concat(Contacts, " ", contact);

                // RemoveAt shifts the following row into slot i, so the index must not advance
                // here; otherwise the row right after a match would never be examined.
                infoRowsClone.RemoveAt(i);
            }
        }
示例#20
0
        /// <summary>
        /// Evaluates each XPath of <paramref name="xPathMatchList"/> against <paramref name="htmlNode"/>
        /// and keeps, keyed by property name, the results that contain exactly
        /// <paramref name="nodeCount"/> nodes.
        /// </summary>
        private static Dictionary<string, List<HtmlNode>> GetXPathNodeDict(HtmlNode htmlNode, List<XPathMatch> xPathMatchList, int nodeCount)
        {
            var nodesByProperty = new Dictionary<string, List<HtmlNode>>();

            foreach (XPathMatch match in xPathMatchList)
            {
                HtmlNodeCollection found = htmlNode.SelectNodes(match.XPath);

                // Skip expressions that match nothing or a different number of nodes.
                bool hasExpectedCount = found != null && found.Count == nodeCount;
                if (!hasExpectedCount)
                {
                    continue;
                }

                nodesByProperty.Add(match.PropertyName, new List<HtmlNode>(found));
            }

            return nodesByProperty;
        }
示例#21
0
        /// <summary>
        /// Projects the nodes of a collection into <c>OutlineNode</c> values.
        /// </summary>
        /// <param name="collection">Nodes to project.</param>
        /// <param name="exceptEmptyTags">When true, nodes whose cleared inner text is empty are left out.</param>
        /// <param name="disablePosition">When true every position is -1; otherwise it is the zero-based index among the returned nodes.</param>
        /// <returns>A lazily produced sequence with one outline node per retained node.</returns>
        public static IEnumerable<OutlineNode> ToOutlineNodes(this HtmlNodeCollection collection, bool exceptEmptyTags = false, bool disablePosition = true)
        {
            // Single pass over a snapshot: the filter is evaluated once per node instead of being
            // re-run by Count()/ElementAt() on every iteration.
            int position = 0;

            foreach (HtmlNode node in collection.ToList())
            {
                var text = Clear(node.InnerText);

                if (exceptEmptyTags && text == string.Empty)
                {
                    continue;
                }

                yield return new OutlineNode
                {
                    Position = disablePosition ? -1 : position,
                    TagName = node.Name,
                    InnerText = text
                };

                position++;
            }
        }
示例#22
0
        /// <summary>
        /// Returns the child nodes of the product list (<c>ul</c> with class
        /// "products-grid category-products-grid product-items-list") found in the page.
        /// </summary>
        /// <param name="page">Markup of the page.</param>
        /// <returns>The list's child nodes; an empty list when the page has no such list.</returns>
        private List<HtmlNode> GetProductContainers(string page)
        {
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(page);

            HtmlNodeCollection grids = htmlDoc.DocumentNode.SelectNodes("//ul[@class='products-grid category-products-grid product-items-list']");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (grids == null || grids.Count == 0)
            {
                return new List<HtmlNode>();
            }

            // When several lists match, the last match is the one used.
            HtmlNode list = grids[grids.Count - 1];

            return list.ChildNodes.ToList();
        }
        // Collects every link to a Blackboard LearningUnit found in the page source.
        // A link qualifies when its href contains "displayLearningUnit".
        // Yields null (not an empty list) when the page has no such link.
        // Meant for the page of a BbContentDirectory; learning units resemble
        // folders/directories but are structured differently.
        public static List<HtmlNode> GetLearningUnitLinks(string pageSource)
        {
            var document = new HtmlDocument();
            document.LoadHtml(pageSource);

            HtmlNodeCollection unitLinks = document.DocumentNode.SelectNodes("//a[contains(@href,'displayLearningUnit')]");

            return unitLinks == null ? null : unitLinks.ToList();
        }
        // Collects every content link (folder or file) found in the page source.
        // Links are taken from <li> elements whose id contains "contentListItem"; links whose
        // href contains "uploadAssignment?" (assignment submission links) are left out.
        // Yields null (not an empty list) when the page has no such link.
        // Meant for the page of a BbContentDirectory.
        public static List<HtmlNode> GetContentLinks(string pageSource)
        {
            var document = new HtmlDocument();
            document.LoadHtml(pageSource);

            HtmlNodeCollection links = document.DocumentNode.SelectNodes("//li[contains(@id, 'contentListItem')]//a[@href and not(contains(@href,'uploadAssignment?'))]");

            return links == null ? null : links.ToList();
        }
示例#25
0
        /// <summary>
        /// Selects the nodes matched by <paramref name="root"/> and derives crawl items from them
        /// through the list-based <c>GetDiffNodes</c> overload, then merges in copies of
        /// <paramref name="exists"/>.
        /// </summary>
        /// <param name="doc2">Document to query.</param>
        /// <param name="root">XPath of the parent nodes.</param>
        /// <param name="isAttrEnabled">Passed through to the list-based overload.</param>
        /// <param name="exists">Optional items to keep; each is copied, and derived items with the same XPath are replaced by the copy.</param>
        /// <param name="minNodeCount">Minimum number of matched nodes required to proceed.</param>
        /// <returns>The resulting items; empty when the XPath fails, matches nothing, or matches too few nodes.</returns>
        private static List<CrawlItem> GetDiffNodes(HtmlDocument doc2, string root, bool isAttrEnabled,
                                                    IEnumerable<CrawlItem> exists = null, int minNodeCount = 2)
        {
            HtmlNodeCollection matched;

            try
            {
                matched = doc2.DocumentNode.SelectNodes(root);
            }
            catch (Exception ex)
            {
                // The logged suffix says the XPath expression may be wrong.
                XLogSys.Print.Error(ex.Message + "  可能XPath表达式有误");
                return new List<CrawlItem>();
            }

            bool tooFewNodes = matched == null || matched.Count < minNodeCount;
            if (tooFewNodes)
            {
                return new List<CrawlItem>();
            }

            var derived  = new List<CrawlItem>();
            var nodeList = matched.ToList();

            // A difference can only be computed between at least two nodes.
            if (nodeList.Count > 1)
            {
                GetDiffNodes(nodeList, derived, new List<List<string>>(), isAttrEnabled);
            }

            if (exists != null)
            {
                // Copy the existing items so the caller's instances are not shared with the result.
                var clones = new List<CrawlItem>();
                foreach (var existing in exists)
                {
                    var clone = new CrawlItem();
                    existing.DictCopyTo(clone);
                    clones.Add(clone);
                }

                derived.RemoveElementsNoReturn(item => clones.Any(clone => IsSameXPath(item.XPath, clone.XPath, root)));
                derived.AddRange(clones);
            }

            return derived;
        }
示例#26
0
        /// <summary>
        /// Selects the nodes matched by <paramref name="shortv"/> and derives crawl items from them
        /// through the list-based <c>GetDiffNodes</c> overload, then merges in <paramref name="exists"/>.
        /// </summary>
        /// <param name="doc2">Document to query.</param>
        /// <param name="shortv">XPath of the parent nodes.</param>
        /// <param name="isAttrEnabled">Passed through to the list-based overload.</param>
        /// <param name="exists">Optional items to keep; derived items with the same XPath are replaced by them.</param>
        /// <returns>The resulting items; empty when the XPath fails or matches nothing.</returns>
        private static List<CrawlItem> GetDiffNodes(HtmlDocument doc2, string shortv, bool isAttrEnabled,
                                                    IEnumerable<CrawlItem> exists = null)
        {
            HtmlNodeCollection matched;

            try
            {
                matched = doc2.DocumentNode.SelectNodes(shortv);
            }
            catch (Exception ex)
            {
                // The logged suffix says the XPath expression may be wrong.
                XLogSys.Print.Error(ex.Message + "  可能XPath表达式有误");
                return new List<CrawlItem>();
            }

            if (matched == null)
            {
                if (!string.IsNullOrEmpty(shortv))
                {
                    // The warning says the current parent-node XPath finds no node.
                    XLogSys.Print.Warn("当前父节点XPath找不到任何节点");
                }

                return new List<CrawlItem>();
            }

            var derived  = new List<CrawlItem>();
            var nodeList = matched.ToList();

            // A difference can only be computed between at least two nodes.
            if (nodeList.Count > 1)
            {
                GetDiffNodes(nodeList, derived, new List<List<string>>(), isAttrEnabled);
            }

            if (exists != null)
            {
                derived.RemoveElementsNoReturn(item => exists.Any(known => IsSameXPath(item.XPath, known.XPath, shortv)));
                derived.AddRange(exists);
            }

            return derived;
        }
示例#27
0
        /// <summary>
        /// Finds candidate nodes with an XPath query, then keeps only those accepted by the
        /// validation function.
        /// </summary>
        /// <param name="root">Node the query is evaluated against.</param>
        /// <param name="XQuery">XPath expression selecting the candidates.</param>
        /// <param name="ValidateFunction">Optional predicate; when omitted every candidate is kept.</param>
        /// <returns>
        /// The accepted nodes, or <c>null</c> when <paramref name="root"/> is null,
        /// <paramref name="XQuery"/> is null or empty, or the query matches nothing.
        /// </returns>
        public static List<HtmlNode> FilterNodes(HtmlNode root, string XQuery, ValidateNode ValidateFunction = null)
        {
            if (root == null || string.IsNullOrEmpty(XQuery))
            {
                return null;
            }

            HtmlNodeCollection candidates = root.SelectNodes(XQuery);
            if (candidates == null)
            {
                return null;
            }

            var accepted = new List<HtmlNode>();
            foreach (HtmlNode candidate in candidates)
            {
                if (ValidateFunction == null || ValidateFunction(candidate))
                {
                    accepted.Add(candidate);
                }
            }

            return accepted;
        }
示例#28
0
        /// <summary>
        /// Validates the <c>textarea</c> fields of the document: at least one must exist, and
        /// its labels are checked through <c>CheckLabels</c>.
        /// </summary>
        /// <returns>
        /// The problems found (empty when there are none). An exception raised while inspecting
        /// the document is reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List<string> CheckTextareaFields()
        {
            var errors = new List<string>();

            try
            {
                HtmlNodeCollection textareas = this.HtmlDoc.DocumentNode.SelectNodes("//textarea");
                bool hasTextarea = textareas != null && textareas.Count >= 1;

                if (hasTextarea)
                {
                    errors.AddRange(CheckLabels(textareas.ToList(), "textarea"));
                }
                else
                {
                    errors.Add("Does not contains enough textarea fields.");
                }
            }
            catch (Exception e)
            {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return errors;
        }
        // Gathers all links to files or content items on a LearningUnit content page.
        // Two queries are combined: links inside a div of class 'vtbegenerated' come first,
        // followed by links inside a span of class 'fnt3'. The list is empty when neither matches.
        public static List<HtmlNode> GetLearningUnitContent(string pageSource)
        {
            var document = new HtmlDocument();
            document.LoadHtml(pageSource);

            string[] linkQueries =
            {
                "//div[@class='vtbegenerated']//a[@href]",
                "//span[@class='fnt3']//a[@href]"
            };

            var allLinks = new List<HtmlNode>();
            foreach (string query in linkQueries)
            {
                HtmlNodeCollection found = document.DocumentNode.SelectNodes(query);
                if (found != null)
                {
                    allLinks.AddRange(found);
                }
            }

            return allLinks;
        }
示例#30
0
        /// <summary>
        /// Renders a node's inner markup as text: direct <c>hr</c> children are removed, each direct
        /// <c>img</c>/<c>input</c> child is replaced by its src, value and the target taken from its
        /// onclick handler, and line-break tags become CR/LF.
        /// </summary>
        /// <param name="node">Node to render; <c>null</c> yields an empty string.</param>
        /// <returns>The rendered text, prefixed with a CR/LF.</returns>
        private string parseText_Img_Link(HtmlNode node)
        {
            if (node == null)
            {
                return "";
            }

            // Remove the direct <hr> children one at a time; stop if the removal itself fails.
            HtmlNode nd;
            while ((nd = node.SelectSingleNode("./hr")) != null)
            {
                try
                {
                    node.RemoveChild(nd);
                }
                catch (ArgumentOutOfRangeException) { break; }
            }

            string             result = node.InnerHtml;
            HtmlNodeCollection hnc    = node.SelectNodes("./img|./input");

            if (hnc != null)
            {
                foreach (HtmlNode hn in hnc)
                {
                    string onclick = hn.GetAttributeValue("onclick", "window.open(\"\")");

                    // The slice drops a 13-character prefix (the length of window.open(" ) and a
                    // 2-character suffix. A handler shorter than those 15 characters would make
                    // Substring throw, so it contributes nothing instead.
                    string target = onclick.Length >= 15
                                    ? onclick.Substring(13, onclick.Length - 15)
                                    : "";

                    result = result.Replace(hn.OuterHtml,
                                            hn.GetAttributeValue("src", "1")
                                            + hn.GetAttributeValue("value", "")
                                            + target);
                }
            }

            return "\r\n" + result.Replace("<br>", "\r\n").Replace("</br>", "\r\n").Trim();
        }