HtmlNodeCollection.Where C# (CSharp)代码示例

示例#1

0

显示文件

        /// <summary>
        /// Splits a node list into the nodes at even indexes and the nodes at odd indexes.
        /// </summary>
        /// <param name="nodes">Nodes to split by position.</param>
        /// <returns>
        /// A pair of lazily evaluated sequences: first the nodes at indexes 0, 2, 4, ...,
        /// then the nodes at indexes 1, 3, 5, ...
        /// </returns>
        private (IEnumerable <HtmlNode>, IEnumerable <HtmlNode>) splitList(HtmlNodeCollection nodes)
        {
            // Both queries are deferred; nothing is enumerated until the caller iterates.
            IEnumerable <HtmlNode> oddPositioned  = nodes.Where((node, position) => position % 2 == 1);
            IEnumerable <HtmlNode> evenPositioned = nodes.Where((node, position) => position % 2 == 0);

            return(evenPositioned, oddPositioned);
        }

示例#2

0

显示文件

文件： Scrapper.cs 项目： jirkaceska/C_sharp

        /// <summary>
        /// Scrapes the weekly menu page referenced by <c>Constants.plzenskyDvur</c>, parses one
        /// <c>DayMenu</c> per day and passes the result to <c>SaveRestaurant</c>.
        /// </summary>
        private void LoadPlzenskyDvur()
        {
            HtmlNode           root       = Utils.GetHtmlDoc(Constants.plzenskyDvur).DocumentNode;
            HtmlNode           weekMenu   = root.SelectSingleNode("//div[@class='listek']/div[@class='tyden']");
            HtmlNodeCollection priceCells = root.SelectNodes("//div[@class='listek']/div[@class='tyden_ceny']//td");

            // Turns one heading/text pair into a DayMenu.
            DayMenu ParseDay(HtmlNode heading, HtmlNode body)
            {
                // The date is the second space-separated token of the heading text.
                string[]           headingTokens = heading.InnerText.Split(' ');
                DateTime           day           = Utils.ParseDateTime(headingTokens[1]);
                HtmlNodeCollection paragraphs    = body.SelectNodes("./p");
                string             soupText      = paragraphs[0].InnerText;

                // Paragraphs at indexes 2, 4, 6, ... are paired positionally with the price cells.
                var dishParagraphs = paragraphs.Where((_, index) => index > 0 && index % 2 == 0);

                Food[] dishes = dishParagraphs
                                .Zip(priceCells, (dishNode, priceNode) =>
                {
                    var dishName = HtmlEntity.DeEntitize(dishNode.InnerText.Trim());
                    // The price is the part after the first '-' in the cell text.
                    var priceText = priceNode.InnerText.Split('-')[1].Trim();

                    return(new Food(dishName, Utils.ParsePrice(priceText, ' ')));
                })
                                .ToArray();

                return(new DayMenu(day, soupText, dishes));
            }

            HtmlNodeCollection headings = weekMenu.SelectNodes("./p[@class='title']");
            HtmlNodeCollection bodies   = weekMenu.SelectNodes("./div[@class='text']");

            // The first heading and first text block are dropped before pairing
            // (presumably a non-day introductory entry - confirm against the page).
            headings.Remove(0);
            bodies.Remove(0);

            DayMenu[] parsedDays = headings.Zip(bodies, ParseDay).ToArray();
            string    name       = GetRestaurantName(root);

            SaveRestaurant(name, parsedDays, Restaurants.PlzenskyDvur);
        }

示例#3

0

显示文件

文件： Html.cs 项目： FherStk/AutoCheck

        /// <summary>
        /// Returns the label nodes related to the xpath resulting nodes.
        /// </summary>
        /// <param name="root">Root node from where the XPath expression will be evaluated.</param>
        /// <param name="xpath">XPath expression.</param>
        /// <returns>
        /// Dictionary with key-pair values, where the key is the main field node, and the value is a set of its
        /// related label nodes. The value is null when the field has no id or the document contains no labels.
        /// The dictionary is empty when <paramref name="root"/> is null or the XPath expression matches nothing.
        /// </returns>
        public Dictionary <HtmlNode, HtmlNode[]> GetRelatedLabels(HtmlNode root, string xpath)
        {
            var results = new Dictionary <HtmlNode, HtmlNode[]>();

            if (root == null)
            {
                return(results);
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection fields = root.SelectNodes(xpath);
            if (fields == null)
            {
                return(results);
            }

            // The document's labels do not depend on the field, so they are queried at most once,
            // and only when some field actually carries an id.
            HtmlNodeCollection labels       = null;
            bool               labelsLoaded = false;

            foreach (HtmlNode node in fields)
            {
                string id = node.GetAttributeValue("id", "");
                if (string.IsNullOrEmpty(id))
                {
                    results.Add(node, null);
                    continue;
                }

                if (!labelsLoaded)
                {
                    labels       = this.HtmlDoc.DocumentNode.SelectNodes("//label");
                    labelsLoaded = true;
                }

                if (labels == null)
                {
                    results.Add(node, null);
                }
                else
                {
                    results.Add(node, labels.Where(x => x.GetAttributeValue("for", "").Equals(id)).ToArray());
                }
            }

            return(results);
        }

示例#4

0

显示文件

        /// <summary>
        /// Appends to <c>parsedLines</c> the XPath of every node in <c>documentNodes</c> that has a
        /// parent node and whose trimmed inner text is not empty.
        /// </summary>
        private void GhettoParse()
        {
            // Background on slicing a document by <h2> headings with XPath:
            // https://stackoverflow.com/questions/37320624/htmlagilitypack-how-to-extract-html-between-some-tag
            //
            // TODO (carried over from the original notes): classify each collected line -
            //   h2   -> start of an area (identify it, look up its template id)
            //   text -> a property
            //   u    -> a value
            var attachedNodes = documentNodes.Where(node => node.ParentNode is HtmlNode);
            var nodesWithText = attachedNodes.Where(node => !string.IsNullOrEmpty(node.InnerText.Trim()));

            foreach (var node in nodesWithText)
            {
                // Only the node's location is recorded for now, not its text.
                parsedLines.Add(node.XPath);
            }
        }

示例#5

0

显示文件

文件： TVUtils.cs 项目： jmn319/mmb

        //Updated July 15th
        /// <summary>
        /// Downloads the show list page and inserts one <c>TvShow</c> document per qualifying
        /// anchor into the configured Mongo show collection.
        /// </summary>
        /// <remarks>
        /// An anchor qualifies when its inner HTML contains none of the comma-separated fragments
        /// in the <c>show_innerhtml_excl</c> setting and its first attribute has a non-null value.
        /// Anchors without any attribute are skipped. Any failure is written to the log file
        /// instead of being propagated.
        /// </remarks>
        public static void WriteShows()
        {
            try
            {
                HtmlNodeCollection collection = DownloadMgmtUtils.GetHtmlNodeCollection
                                                    (ConfigurationManager.AppSettings["show_url"] +
                                                    ConfigurationManager.AppSettings["show_url_list_postfix"], "//a");

                // Report a missing link list explicitly rather than through a null dereference.
                if (collection == null)
                {
                    Log.AppendToLog("Error : FATAL Write Show Issue : no links were returned for the show list page",
                                    ConfigurationManager.AppSettings["log_file"]);
                    return;
                }

                MongoCollection mongoCollection = MongoUtils.GetMongoCollection
                                                  (
                    @"mongodb://" + ConfigurationManager.AppSettings["mongoHost"] + @"/",
                    ConfigurationManager.AppSettings["port"],
                    ConfigurationManager.AppSettings["db"],
                    ConfigurationManager.AppSettings["show_collection"]
                                                  );

                // Split the exclusion list once instead of once per link.
                string[] excludedFragments = ConfigurationManager.AppSettings["show_innerhtml_excl"].Split(',');

                foreach (HtmlNode link in collection)
                {
                    if (excludedFragments.Any(s => link.InnerHtml.Contains(s)))
                    {
                        continue;
                    }

                    // Indexing Attributes[0] on an attribute-less anchor throws and would abort the
                    // whole import, so such anchors are skipped (same guard as RefreshEpisodes).
                    if (link.Attributes.Count == 0 || link.Attributes[0].Value == null)
                    {
                        continue;
                    }

                    mongoCollection.Insert(new TvShow()
                    {
                        Name = link.InnerHtml, Path = link.Attributes[0].Value
                    });
                }
            }
            catch (Exception e)
            {
                Log.AppendToLog("Error : FATAL Write Show Issue : " + e, ConfigurationManager.AppSettings["log_file"]);
            }
        }

示例#6

0

显示文件

文件： MailingListScrapper.cs 项目： atifrehman/NDN-Mailing-List-Search-App

        /// <summary>
        /// Loads a mailing list index page and collects the "[ Thread ]" links found in its table cells.
        /// </summary>
        /// <param name="mailingListURL">URL of the index page; also used as the prefix of every returned URL.</param>
        /// <returns>
        /// The full URLs (<paramref name="mailingListURL"/> + link href) in document order.
        /// Empty when the page has no matching table cells. Each URL is also written to the console.
        /// </returns>
        private List <string> FetchMailingListMonths(string mailingListURL)
        {
            List <string> urls = new List <string>();

            HtmlWeb      web      = new HtmlWeb();
            HtmlDocument document = web.Load(mailingListURL);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection cells = document.DocumentNode.SelectNodes("//table/tr/td");
            if (cells == null)
            {
                return(urls);
            }

            foreach (var cell in cells)
            {
                HtmlNode threadNode = cell.ChildNodes.FirstOrDefault(x => x.Name == "a" && x.InnerText == "[ Thread ]");

                // Skip cells without a thread link, and thread links that carry no href.
                string href = threadNode?.Attributes["href"]?.Value;
                if (href == null)
                {
                    continue;
                }

                // construct full url
                string fullUrl = mailingListURL + href;
                urls.Add(fullUrl);
                Console.WriteLine(fullUrl);
            }

            return(urls);
        }

示例#7

0

显示文件

        /// <summary>
        /// Looks up the ISIN of a KDR for the given equity and stores it on the item.
        /// </summary>
        /// <param name="item">Equity to update; its <c>KoreaName</c> drives the search and its <c>Ticker</c> the match.</param>
        /// <returns>
        /// True when a search row of type "예탁증서" (Korean for "depositary receipt") resolves to the
        /// item's ticker, in which case <c>ISIN</c> and <c>Type</c> ("KDR") are set; otherwise false.
        /// </returns>
        public static bool UpdateKDRISINReport(KoreaEquityInfo item)
        {
            HtmlNodeCollection rows = SearchISIN(item.KoreaName, false, false);

            if (rows == null)
            {
                return(false);
            }

            foreach (HtmlNode row in rows)
            {
                // The fourth cell holds the security type; only depositary receipts are considered.
                string securityType = row.SelectSingleNode(".//td[4]").InnerText.Trim();
                if (!securityType.Equals("예탁증서"))
                {
                    continue;
                }

                // The second cell holds the ISIN, which is resolved back to a ticker for comparison.
                string candidateIsin   = row.SelectSingleNode(".//td[2]").InnerText.Trim();
                string candidateTicker = GetTickerByISIN(candidateIsin, 2);

                if (!item.Ticker.Equals(candidateTicker))
                {
                    continue;
                }

                item.ISIN = candidateIsin;
                item.Type = "KDR";

                return(true);
            }

            return(false);
        }

示例#8

0

显示文件

文件： BrowserAllegroOfferListController.cs 项目： dawidkacprzak/Platinum.Core

        /// <summary>
        /// Extracts the offer URLs from the listing container of the current page.
        /// </summary>
        /// <returns>
        /// Lazily yields the href of every anchor inside the <c>opbox-listing--base</c> container
        /// whose href contains both "/oferta/" and "http".
        /// </returns>
        /// <exception cref="OfferListControllerException">
        /// Thrown (on first enumeration) when the listing container cannot be found.
        /// </exception>
        public IEnumerable <string> GetAllOfferLinks()
        {
            string       pageSource = CurrentSiteSource(pageId);
            HtmlDocument document   = new HtmlDocument();

            document.LoadHtml(pageSource);

            var offerContainer = document.DocumentNode.SelectNodes("//div[@id=\"opbox-listing--base\"]");

            if (offerContainer == null || !offerContainer.Any())
            {
                throw new OfferListControllerException("Allegro layout has been changed", this);
            }

            // ".//a" keeps the search inside the container. A leading "//" restarts from the
            // document root even when called on a child node, which made the container pointless.
            HtmlNodeCollection offerLinks = offerContainer.First().SelectNodes(".//a");

            // SelectNodes returns null when the container holds no anchors at all.
            List <HtmlNode> offerLinksNode = offerLinks == null
                ? new List <HtmlNode>()
                : offerLinks.Where(x => x.HasAttributes).ToList();
            logger.Info("offerl links: " + offerLinksNode.Count);

            foreach (var offerLink in offerLinksNode)
            {
                string href = offerLink.Attributes["href"]?.Value;

                if (href != null && href.Contains("/oferta/") && href.Contains("http"))
                {
                    yield return(href);
                }
            }
        }

示例#9

0

显示文件

        /// <summary>
        /// Verifies that every text-like input and every textarea of the document defines a placeholder.
        /// </summary>
        /// <returns>
        /// The list of problems found (empty when everything is fine). An exception raised while
        /// checking is reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List <string> CheckPlaceholders()
        {
            var errors = new List <string>();

            try
            {
                HtmlNodeCollection inputNodes = this.HtmlDoc.DocumentNode.SelectNodes("//input");
                if (inputNodes == null)
                {
                    errors.Add("Unable to find any placeholder.");
                    return(errors);
                }

                // Input types for which a placeholder is not expected.
                string[] exemptTypes = { "radio", "checkbox", "reset", "submit" };

                List <HtmlNode> fields = inputNodes
                                         .Where(input => !exemptTypes.Contains(input.GetAttributeValue("type", "")))
                                         .ToList();

                HtmlNodeCollection textAreas = this.HtmlDoc.DocumentNode.SelectNodes("//textarea");
                if (textAreas != null)
                {
                    fields.AddRange(textAreas);
                }

                bool someFieldLacksPlaceholder =
                    fields.Any(field => !field.Attributes.Any(attribute => attribute.Name == "placeholder"));

                if (someFieldLacksPlaceholder)
                {
                    errors.Add("Some fields does not have any defined placeholder.");
                }
            }
            catch (Exception e)
            {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return(errors);
        }

示例#10

0

显示文件

        /// <summary>
        /// Verifies the document's select fields: at least one select (with valid labels),
        /// at least three options overall, and exactly one option marked as selected.
        /// </summary>
        /// <returns>
        /// The list of problems found (empty when everything is fine). An exception raised while
        /// checking is reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List <string> CheckSelectFields()
        {
            var errors = new List <string>();

            try
            {
                HtmlNodeCollection selects = this.HtmlDoc.DocumentNode.SelectNodes("//select");
                bool hasSelect = selects != null && selects.Count >= 1;

                if (hasSelect)
                {
                    errors.AddRange(CheckLabels(selects.ToList(), "select"));
                }
                else
                {
                    errors.Add("Does not contains enough select fields.");
                }

                HtmlNodeCollection options = this.HtmlDoc.DocumentNode.SelectNodes("//select/option");
                if (options == null || options.Count < 3)
                {
                    errors.Add("The select field does not contains enough options.");
                }
                else
                {
                    // Counts the options that carry a "selected" attribute, whatever its value.
                    int defaultOptions = options.Count(option => option.Attributes.Any(attribute => attribute.Name == "selected"));
                    if (defaultOptions != 1)
                    {
                        errors.Add("The select field does not have a single default option.");
                    }
                }
            }
            catch (Exception e)
            {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return(errors);
        }

示例#11

0

显示文件

文件： ChesaningVillageMI.cs 项目： cykb518hu/Scraper

        /// <summary>
        /// Walks every configured category page, finds its PDF links and hands each document
        /// dated on or after <c>dtStartFrom</c> to <c>ExtractADoc</c>.
        /// </summary>
        /// <remarks>
        /// Each entry of <c>docUrls</c> has the form "category*url". When the page contains a
        /// "Council Packet" element, links are taken from its enclosing table; otherwise from the
        /// document-center container. Links that contain an image, and links whose text holds no
        /// recognizable date, are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            var       docs    = this.LoadDocumentsDoneSQL();
            var       queries = this.LoadQueriesDoneSQL();
            WebClient c       = new WebClient();
            // Accepts either M/D/YYYY (1-2 digit month and day) or a packed 8-digit MMDDYYYY.
            Regex     dateReg = new Regex("(([0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4})|((0|1)[0-9]{1}[0-9]{2}[0-9]{4}))");

            foreach (string url in this.docUrls)
            {
                string[]     urlParts    = url.Split('*');
                string       category    = urlParts[0];
                string       categoryUrl = urlParts[1];
                string       html        = this.GetHtml(categoryUrl, string.Empty);
                HtmlDocument doc         = new HtmlDocument();
                doc.LoadHtml(html);
                HtmlNode           councilPacketNode = doc.DocumentNode.SelectSingleNode("//*[text()='Council Packet']");
                HtmlNodeCollection fileNodes         = null;

                if (councilPacketNode != null)
                {
                    // The packet label may not sit inside a table; in that case there is nothing to read.
                    HtmlNode packetTable = councilPacketNode.Ancestors().FirstOrDefault(t => t.OriginalName == "table");
                    if (packetTable != null)
                    {
                        fileNodes = packetTable.SelectNodes(".//div[@id='file_name']//a[contains(@href,'.pdf')]");
                    }
                }
                else
                {
                    fileNodes = doc.DocumentNode.SelectNodes("//div[@id='RZdocument_center']//a[contains(@href,'.pdf')]");
                }

                if (fileNodes == null)
                {
                    continue;
                }

                // Anchors wrapping an <img> are icon links, not titled documents.
                var fileNodesTarget = fileNodes.Where(t => t.SelectSingleNode("./img") == null);
                foreach (HtmlNode fileNode in fileNodesTarget)
                {
                    string fileUrl = fileNode.Attributes["href"].Value;
                    fileUrl = !fileUrl.StartsWith("http") ? this.cityEntity.CityUrl + fileUrl : fileUrl;
                    Match  dateMatch       = dateReg.Match(fileNode.InnerText);
                    string meetingDateText = dateMatch.ToString();

                    Console.WriteLine("DEBUG: {0}", fileUrl);
                    Console.WriteLine("DEBUG: {0}", fileNode.OuterHtml);
                    Console.WriteLine("DEBUG: meeting date - {0}...", meetingDateText);

                    // Without a date the document cannot be filed; parsing "" would throw.
                    if (!dateMatch.Success)
                    {
                        Console.WriteLine("No meeting date found, skip...");
                        continue;
                    }

                    // Choose the parser by shape, not by length: "1/1/2020" is also 8 characters
                    // long but is not in the packed MMddyyyy form.
                    DateTime meetingDate = meetingDateText.Contains("/") ?
                                           DateTime.Parse(meetingDateText) :
                                           DateTime.ParseExact(meetingDateText, "MMddyyyy", null);

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Early, skip...");
                        continue;
                    }

                    this.ExtractADoc(c, fileUrl, category, "pdf", meetingDate, ref docs, ref queries);
                }
            }
        }

示例#12

0

显示文件

文件： BrowntownCharterTownshipMI.cs 项目： cykb518hu/Scraper

        /// <summary>
        /// Walks every configured category page, collects the table cells that link to documents
        /// for each year from <c>dtStartFrom.Year</c> to the current year, and hands every entry
        /// dated on or after <c>dtStartFrom</c> to <c>ExtractADoc</c>.
        /// </summary>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            WebClient          c       = new WebClient();

            foreach (string url in this.docUrls)
            {
                // Each entry has the form "category*url".
                string       category    = url.Split('*')[0];
                string       categoryUrl = url.Split('*')[1];
                // Base URL = category URL with the text of its last '/'-segment removed
                // (Replace removes every occurrence of that text, not only the trailing one).
                string       baseUrl     = categoryUrl.Replace(categoryUrl.Split('/').LastOrDefault(), string.Empty);
                HtmlDocument doc         = web.Load(categoryUrl);

                HtmlNodeCollection docNodes = doc.DocumentNode.SelectNodes("//div[@class='center_body_text center_scroller']//table//tr/td");

                // SelectNodes returns null when the page has no matching cells.
                if (docNodes != null)
                {
                    for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
                    {
                        // Cells whose direct link has an href starting with the year being processed.
                        List <HtmlNode> entries = docNodes.Where(t =>
                                                                 t.SelectSingleNode("./a[@href]") != null &&
                                                                 t.SelectSingleNode("./a[@href]").Attributes["href"].Value.StartsWith(i.ToString()))
                                                  .ToList();

                        foreach (HtmlNode entryNode in entries)
                        {
                            // Date text = cell text up to the first '(' (newlines removed, HTML-decoded) plus ", <year>".
                            string   meetingDateText = string.Format("{0}, {1}", HttpUtility.HtmlDecode(entryNode.InnerText.Replace("\n", string.Empty).Split('(').FirstOrDefault()), i);
                            HtmlNode entryUrlNode    = entryNode.SelectSingleNode("./a");
                            // Diagnostic mode: only compiled when a lowercase "debug" symbol is defined.
                            // It reports whether the date text parses and then skips the download.
#if debug
                            try
                            {
                                DateTime.Parse(meetingDateText);
                                Console.WriteLine("No problem...");
                                continue;
                            }
                            catch
                            {
                                Console.WriteLine("Not match: {0} on {1}...", meetingDateText, categoryUrl);
                                continue;
                            }
#endif

                            // NOTE(review): throws FormatException when the cell text is not a parsable date.
                            DateTime meetingDate = DateTime.Parse(meetingDateText);
                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }
                            // The href is treated as relative to the category page's base URL.
                            string docUrl = string.Format("{0}{1}", baseUrl, entryUrlNode.Attributes["href"].Value);
                            this.ExtractADoc(c, docUrl, category, "pdf", meetingDate, ref docs, ref queries);
                        }
                    }
                }
            }
        }

示例#13

0

显示文件

文件： HTMLParser.cs 项目： witalloliveira/Blackboard-Downloader

        // Returns a list of HtmlNodes representing every <a> tag linking to a Module.
        // Used to determine all modules the user has access to and populate their content.
        // The list is empty when the page contains no links at all.
        public static List <HtmlNode> GetModuleLinks(string pageSource)
        {
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(pageSource);
            HtmlNodeCollection allLinks = doc.DocumentNode.SelectNodes("//a[@href]");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (allLinks == null)
            {
                return(new List <HtmlNode>());
            }

            // Module links have "type=Course" in the href tag
            List <HtmlNode> moduleLinks = allLinks.Where(item => item.Attributes["href"].Value.Contains("type=Course")).ToList();

            return(moduleLinks);
        }

示例#14

0

显示文件

        /// <summary>
        /// Fetches a page and returns the URL of its preview image.
        /// </summary>
        /// <param name="url">Page URL. Mobile Twitter and Tumblr image URLs are rewritten before fetching.</param>
        /// <returns>
        /// For reddit gallery URLs: the first image source hosted on preview.redd.it, or null when
        /// there is none. Otherwise: the content of an <c>og:image</c> / <c>twitter:image</c> meta
        /// tag, preferring jpg/jpeg/png over anything else, with a trailing "?play" trimmed;
        /// an empty string when no such tag with content exists.
        /// </returns>
        public static string GetOg(string url)
        {
            //Handle edge cases
            if (url.Contains("mobile.twitter.com"))
            {
                url = url.Replace("mobile.", String.Empty);
            }

            if (url.Contains("tumblr.com/image/"))
            {
                url = url.Replace("/image/", "/post/");
            }

            string       resultUrl = "";
            string       html      = FetchHtml(url);
            HtmlDocument doc       = new HtmlDocument();

            doc.LoadHtml(html);

            if (url.Contains("reddit.com/gallery/"))
            {
                // SelectNodes returns null when the page has no <img> at all.
                HtmlNodeCollection imageNodes = doc.DocumentNode.SelectNodes("//img");
                if (imageNodes == null)
                {
                    return(null);
                }

                return(imageNodes
                       .Select(x => x.Attributes["src"]?.Value)
                       .FirstOrDefault(src => src != null && src.StartsWith("https://preview.redd.it/", StringComparison.Ordinal)));
            }

            HtmlNodeCollection list = doc.DocumentNode.SelectNodes("//meta");

            if (list == null)
            {
                return(string.Empty);
            }

            // Keep only usable candidates: a matching meta tag without content must not hide a
            // later tag that has one.
            List <string> imageUrls = list
                                      .Where(x => x.Attributes["property"]?.Value == "og:image" || x.Attributes["name"]?.Value == "twitter:image")
                                      .Select(x => x.Attributes["content"]?.Value)
                                      .Where(content => content != null)
                                      .ToList();

            if (imageUrls.Count == 0)
            {
                return(resultUrl);
            }

            //Prefer any format vs gif (".jpg?play" is the Gifv still frame)
            string[] preferredSuffixes = { ".jpg", ".jpg?play", ".jpeg", ".png" };
            string   chosen            = imageUrls.FirstOrDefault(content => preferredSuffixes.Any(suffix => content.EndsWith(suffix, StringComparison.Ordinal)))
                                         ?? imageUrls[0];

            try
            {
                resultUrl = chosen.TrimEnd("?play");
            }
            catch (Exception)
            {
                // Best-effort lookup: the original swallowed every failure and returned "".
                // The only call left in here is the project's TrimEnd(string) helper.
            }

            return(resultUrl);
        }

示例#15

0

显示文件

文件： clsCrickBuzzData.cs 项目： kishorasm123/Live-Cricket-2.0

        /// <summary>
        /// Converts a player's batting or bowling career summary into a <see cref="DataTable"/>.
        /// </summary>
        /// <param name="i_objHtmlTableClassNodes">Candidate nodes; the one whose text contains the requested summary title is used.</param>
        /// <param name="i_DataTableRequest">Which summary to read (batting or bowling).</param>
        /// <returns>
        /// A table whose columns come from the header cells of the first row and whose rows come from
        /// the trimmed cell texts of the remaining rows; null when anything goes wrong (including an
        /// unsupported request or a missing summary).
        /// </returns>
        private static DataTable GetPlayerCarrierTable(HtmlNodeCollection i_objHtmlTableClassNodes, DatableSelection i_DataTableRequest)
        {
            try
            {
                HtmlNode summaryTable = null;

                if (i_DataTableRequest == DatableSelection.Batting)
                {
                    summaryTable = i_objHtmlTableClassNodes
                                   .First(x => x.InnerText.Contains("Batting Career Summary"))
                                   .SelectNodes(".//table")
                                   .First();
                }
                else if (i_DataTableRequest == DatableSelection.Bowling)
                {
                    summaryTable = i_objHtmlTableClassNodes
                                   .First(x => x.InnerText.Contains("Bowling Career Summary"))
                                   .SelectNodes(".//table")
                                   .First();
                }

                DataTable table = new DataTable();

                // Column names come from the <th> cells of the first row.
                HtmlNode headerRow = summaryTable.SelectNodes(".//tr").First();
                foreach (string caption in headerRow.SelectNodes(".//th").Select(th => th.InnerText))
                {
                    table.Columns.Add(caption);
                }

                // Every following row contributes its trimmed <td> texts as one record.
                foreach (HtmlNode tableRow in summaryTable.SelectNodes(".//tr").Skip(1))
                {
                    string[] cells = tableRow.Elements("td").Select(td => td.InnerText.Trim()).ToArray();
                    table.Rows.Add(cells);
                }

                return(table);
            }
            catch (Exception)
            {
                // Any structural surprise in the markup yields "no table".
                return(null);
            }
        }

示例#16

0

显示文件

文件： Program.cs 项目： peroxy/FoodParser

        /// <summary>
        /// Downloads the Favola lunch page and renders today's menu as an HTML fragment.
        /// </summary>
        /// <returns>
        /// "&lt;b&gt;Favola:&lt;/b&gt;" followed by one &lt;li&gt; line per menu paragraph (paragraphs
        /// containing "***" are left out), or an empty string when no entry for today's date is found.
        /// </returns>
        public static string GetFavolaMenu()
        {
            var doc = new HtmlDocument
            {
                OptionFixNestedTags         = true,
                OptionDefaultStreamEncoding = Encoding.GetEncoding("windows-1250"),
                OptionAutoCloseOnEnd        = true
            };

            string htmlString;

            using (var client = new WebClient())
            {
                client.Encoding = Encoding.GetEncoding("windows-1250");
                htmlString      = client.DownloadString("http://www.kaval-group.si/FAVOLA,,ponudba/kosila");
            }

            doc.LoadHtml(htmlString);

            HtmlNodeCollection results = doc.DocumentNode.SelectNodes(string.Format("//*[contains(@class,'{0}')]", "childNaviLiElement"));

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (results == null)
            {
                return("");
            }

            // Read the clock once so all four spellings describe the same day; the page may
            // write the date with or without leading zeros.
            DateTime today       = DateTime.Now;
            string[] todayLabels =
            {
                today.ToString("d.M.yyyy"),
                today.ToString("d.MM.yyyy"),
                today.ToString("dd.M.yyyy"),
                today.ToString("dd.MM.yyyy")
            };

            foreach (HtmlNode item in results)
            {
                HtmlNode trigger = item.FirstChild;
                if (trigger == null || trigger.Attributes["onclick"] == null)
                {
                    continue;
                }

                string itemText = item.InnerText;
                if (!todayLabels.Any(label => itemText.Contains(label)))
                {
                    continue;
                }

                // The trigger's class ends in "-<id>"; the matching menu block carries "show show-<id>".
                var classAttribute = trigger.Attributes["class"];
                if (classAttribute == null)
                {
                    continue;
                }

                string   className  = string.Format("show show-{0}", classAttribute.Value.Split('-').LastOrDefault());
                HtmlNode activeMenu = doc.DocumentNode.SelectSingleNode(string.Format("//*[contains(@class,'{0}')]", className));
                if (activeMenu == null)
                {
                    continue;
                }

                var sb = new StringBuilder("<b>Favola:</b>" + Environment.NewLine + Environment.NewLine);
                foreach (HtmlNode childNode in activeMenu.ChildNodes.Where(x => x.Name == "p"))
                {
                    if (!childNode.InnerText.Contains("***"))
                    {
                        sb.AppendLine(string.Format("<li>{0}</li>", childNode.InnerText));
                    }
                }

                return(sb.ToString());
            }

            return("");
        }

示例#17

0

显示文件

文件： Extensions.cs 项目： stereoappa/HtmlComparer

        /// <summary>
        /// Projects a node collection into <c>OutlineNode</c> items (tag name plus cleaned inner text).
        /// </summary>
        /// <param name="collection">Nodes to convert; a null collection yields nothing.</param>
        /// <param name="exceptEmptyTags">When true, nodes whose cleaned inner text is empty are left out.</param>
        /// <param name="disablePosition">When true, every item gets position -1; otherwise its index among the returned items.</param>
        /// <returns>A lazily evaluated sequence of outline nodes in collection order.</returns>
        public static IEnumerable <OutlineNode> ToOutlineNodes(this HtmlNodeCollection collection, bool exceptEmptyTags = false, bool disablePosition = true)
        {
            if (collection == null)
            {
                yield break;
            }

            // Single pass over a snapshot. The previous form re-ran the filter for every Count()
            // and ElementAt(i) call, which was quadratic in the number of nodes.
            // NOTE(review): assumes Clear is a pure text-normalizing helper - it is now called once per node.
            int position = 0;
            foreach (HtmlNode node in collection.ToList())
            {
                string text = Clear(node.InnerText);
                if (exceptEmptyTags && text == string.Empty)
                {
                    continue;
                }

                yield return(new OutlineNode
                {
                    Position = disablePosition ? -1 : position,
                    TagName = node.Name,
                    InnerText = text
                });

                position++;
            }
        }

示例#18

0

显示文件

文件： DriverService.cs 项目： yefan-paskarcastan/AyanaWebApi

        /// <summary>
        /// Prepares the description: strips images, empty divs, rules, line breaks and
        /// repeated blank lines from the given HTML.
        /// </summary>
        /// <param name="desc">HTML of the description.</param>
        /// <param name="poster">Path or URL of the poster image; only its file name is used.</param>
        /// <returns>The cleaned-up HTML.</returns>
        string FormatDescriptionRutor(string desc, string poster)
        {
            HtmlDocument htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(desc);

            HtmlNode           htmlNode        = htmlDocument.DocumentNode;
            // Images wrapped in a link are removed unconditionally.
            HtmlNodeCollection nodesScrenshots = htmlNode.SelectNodes(@"//img[parent::a]");

            if (nodesScrenshots != null)
            {
                foreach (var item in nodesScrenshots)
                {
                    item.Remove();
                }
            }

            HtmlNodeCollection nodesImgs = htmlNode.SelectNodes(@"//img");

            // SelectNodes returns null when no image is left; there is then nothing to remove.
            if (nodesImgs != null)
            {
                if (nodesImgs.Count == 2)
                {
                    // With exactly two images left, only the one that is the poster is removed.
                    string posterFileName = Path.GetFileName(poster);
                    var    posterNode     = nodesImgs.Where(el => el.GetAttributeValue("src", string.Empty)
                                                            .Contains(posterFileName))
                                            .SingleOrDefault();
                    posterNode?.Remove();
                }
                else
                {
                    foreach (var item in nodesImgs)
                    {
                        item.Remove();
                    }
                }
            }

            string description = htmlNode.OuterHtml.Replace("<div></div>", "");

            description = description.Replace("<hr>", "");
            description = description.Replace("<br>", "");

            // Collapse runs of blank lines into single line breaks.
            while (description.Contains(Environment.NewLine + Environment.NewLine))
            {
                description = description.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);
            }
            return(description);
        }

示例#19

0

显示文件

文件： CrockeryTownshipMI.cs 项目： cykb518hu/Scraper

        /// <summary>
        /// Walks every configured page, finds its PDF links year by year and hands each document
        /// dated on or after <c>dtStartFrom</c> to <c>ExtractADoc</c>.
        /// </summary>
        /// <remarks>
        /// The meeting date is read from the link URL; links without a recognizable date are
        /// treated as <see cref="DateTime.MinValue"/> and therefore skipped as too early.
        /// Pages without any PDF link are skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();
            WebClient          c       = new WebClient();
            // Matches e.g. "January 5, 2020" with optional single whitespace around the day.
            Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

            foreach (string url in this.docUrls)
            {
                HtmlDocument       meetingHomeDoc = web.Load(url);
                HtmlNodeCollection fileNodes      = meetingHomeDoc.DocumentNode.SelectNodes("//a[contains(@href,'.pdf')]");

                // SelectNodes returns null (not an empty collection) when the page has no PDF links.
                if (fileNodes == null)
                {
                    continue;
                }

                for (int year = this.dtStartFrom.Year; year <= DateTime.Now.Year; year++)
                {
                    string yearText = year.ToString();

                    // Where never returns null, so the result can be iterated directly.
                    foreach (HtmlNode fileNode in fileNodes.Where(t => t.OuterHtml.Contains(yearText)))
                    {
                        string nodeUrl = fileNode.Attributes["href"].Value;
                        nodeUrl = !nodeUrl.StartsWith("http") ? this.cityEntity.CityUrl + nodeUrl : nodeUrl;
                        DateTime meetingDate     = DateTime.MinValue;
                        string   meetingDateText = dateReg.Match(nodeUrl).ToString();

                        if (!string.IsNullOrEmpty(meetingDateText))
                        {
                            meetingDate = DateTime.Parse(meetingDateText);
                        }
                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }
                        string category = nodeUrl.Contains("PC") || nodeUrl.Contains("Plan") ? "Planning" : "City Council";
                        this.ExtractADoc(c, nodeUrl, category, "pdf", meetingDate, ref docs, ref queries);
                    }
                }
            }
        }

示例#20

0

显示文件

文件： TVUtils.cs 项目： jmn319/mmb

        //Updated July 30th - Combined with WriteEpisodes
        /// <summary>
        /// Re-reads the episode links of every stored show that is not excluded by the
        /// <c>show_excl</c> setting, updates the show from each qualifying link and saves it back.
        /// Any failure is written to the log file instead of being propagated.
        /// </summary>
        public static void RefreshEpisodes()
        {
            try
            {
                MongoCollection mongoCollection = MongoUtils.GetMongoCollection
                                                  (
                    @"mongodb://" + ConfigurationManager.AppSettings["mongoHost"] + @"/",
                    ConfigurationManager.AppSettings["port"],
                    ConfigurationManager.AppSettings["db"],
                    ConfigurationManager.AppSettings["show_collection"]
                                                  );

                foreach (var show in mongoCollection.FindAllAs <TvShow>().ToList <TvShow>())
                {
                    // Skip shows whose name contains any of the comma-separated exclusion fragments.
                    if (!ConfigurationManager.AppSettings["show_excl"].Split(',').Any(s => show.Name.Contains(s)))
                    {
                        // All anchors of the show's own page (show_url + show.Path).
                        HtmlNodeCollection collection =
                            DownloadMgmtUtils.GetHtmlNodeCollection(
                                ConfigurationManager.AppSettings["show_url"] + show.Path, "//a");

                        if (collection != null)
                        {
                            // A link qualifies when its inner HTML contains no excluded fragment, it has
                            // at least one attribute, its first attribute does not mention "tvnews", and
                            // the attribute selected by HrefIndex contains "/ep".
                            foreach (var link in collection.Where(link =>
                                                                  !ConfigurationManager.AppSettings["episode_innerhtml_excl"].Split(',')
                                                                  .Any(s => link.InnerHtml.Contains(s)) &&
                                                                  link.Attributes.Count > 0 && !link.Attributes[0].Value.Contains("tvnews") &&
                                                                  link.Attributes[HrefIndex(link)].Value.Contains("/ep")))
                            {
                                // The link's position within the full anchor list is passed along with the list.
                                UpdateShowFromHtml(show, link.InnerHtml, collection, collection.IndexOf(link));
                            }
                        }

                        // Saved for every non-excluded show, even when no links were retrieved.
                        mongoCollection.Save(show);
                    }
                }
            }
            catch (Exception e)
            {
                Log.AppendToLog("Error : FATAL Refresh Episodes Issue : " + e,
                                ConfigurationManager.AppSettings["log_file"]);
            }
        }

示例#21

0

显示文件

文件： PlayerDataStore.cs 项目： bearandhammer/portfolio

        /// <summary>
        /// Static helper method that tears apart the provided HtmlDocument to construct
        /// a collection of WtaPlayer objects.
        /// </summary>
        /// <param name="wtaPlayerHtmlDocument">The HtmlDocument that contains the WTA player data.</param>
        /// <returns>
        /// An IEnumerable of type <see cref="WtaPlayer"/>. The sequence is empty when the document has no
        /// <c>//tbody/tr</c> rows. It is evaluated lazily, so parse failures surface while enumerating.
        /// </returns>
        private static IEnumerable <WtaPlayer> GetPlayersFromHtmlDocument(HtmlDocument wtaPlayerHtmlDocument)
        {
            // Work with the returned HTML data - first step is to identify the table rows (only a single table on the page at
            // the time of producing this code sample). We only want every other 'tr', as these are the only ones that contain data, also
            // keeping in mind that spacer 'tr' elements exist (without a 'td' Count of 14, so also remove these elements)
            HtmlNodeCollection tbodyRowNodes = wtaPlayerHtmlDocument.DocumentNode.SelectNodes("//tbody/tr");

            // SelectNodes yields null (not an empty collection) when nothing matches; calling Where on it
            // would throw, so report "no players" instead.
            if (tbodyRowNodes == null)
            {
                return(Enumerable.Empty <WtaPlayer>());
            }

            IEnumerable <HtmlNode> everyOtherNode = tbodyRowNodes
                                                    .Where((node, index) => index % 2 == 0 && node?.ChildNodes?.Count == 14)
                                                    .Take(100);

            // Setup a regex to clean up the rank information (strips tags and &nbsp; entities)
            Regex rankCleanerRegex = new Regex(@"<[^>]+>|&nbsp;");

            // Construct and return WtaPlayer objects based on some very (fixed, agreed!) ripping of text from td elements:
            // child 0 is parsed as an int after cleaning, child 3 is passed as text, child 6 is parsed as an int.
            return(everyOtherNode.Select(node =>
                                         new WtaPlayer(int.Parse(rankCleanerRegex.Replace(node.ChildNodes[0].InnerText, string.Empty).Trim()),
                                                       node.ChildNodes[3].InnerText.Trim(),
                                                       int.Parse(node.ChildNodes[6].InnerText.Trim()))));
        }

示例#22

0

显示文件

 /// <summary>
 /// Discovers the pagination range from the "末页" (last page) anchor and enqueues pages 2..last.
 /// </summary>
 /// <param name="url">Current page URL; only checked for null/empty here.</param>
 /// <param name="aNodes">Anchor nodes of the current page; nothing happens when null.</param>
 private void GetNextPage(string url, HtmlNodeCollection aNodes)
 {
     if (string.IsNullOrEmpty(url) || aNodes == null)
     {
         return;
     }

     var lastPageAnchor = aNodes.FirstOrDefault(a => a.InnerText.Contains("末页"));
     if (lastPageAnchor == null)
     {
         return;
     }

     // Example href: https://xm.focus.cn/loupan/p38/?saleStatus=6
     var href = lastPageAnchor.Attributes["href"];
     if (href == null)
     {
         return;
     }

     var lastPageUrl   = href.Value;
     var lastPageToken = Toolslib.Str.Sub(lastPageUrl, "loupan/p", "/");
     int lastPage;
     if (!int.TryParse(lastPageToken, out lastPage))
     {
         Console.WriteLine("无法获取页数");
         return;
     }

     // The first page is not enqueued here; the loop starts at page 2.
     var lastPageSegment = string.Format("/loupan/p{0}", lastPageToken);
     for (var page = 2; page <= lastPage; page++)
     {
         // Swap the last-page path segment for the segment of the page being generated.
         var candidate = lastPageUrl.Replace(lastPageSegment, string.Format("/loupan/p{0}", page));
         if (filter.Contains(candidate))
         {
             continue;
         }

         UrlQueue.Instance.EnQueue(new UrlInfo(candidate)
         {
             Depth = 1
         });
     }
 }

示例#23

0

显示文件

        /// <summary>
        /// Verifies that the document contains at least one <c>input</c> element whose <c>type</c> is "reset".
        /// </summary>
        /// <returns>
        /// A list of error messages; empty when a reset input exists. Exceptions raised while inspecting
        /// the document are reported as an "EXCEPTION: ..." entry instead of being thrown.
        /// </returns>
        private List <string> CheckReset()
        {
            List <string> errors = new List <string>();

            try{
                HtmlNodeCollection nodes = this.HtmlDoc.DocumentNode.SelectNodes("//input"); //TODO: also button is alowed

                // SelectNodes returns null when there is no input at all. HTML attribute values for "type"
                // are case-insensitive, so "RESET" / "Reset" must be accepted as well.
                bool hasReset = nodes != null &&
                                nodes.Any(x => x.GetAttributeValue("type", "").Equals("reset", StringComparison.OrdinalIgnoreCase));

                if (!hasReset)
                {
                    errors.Add("Does not contains any reset button.");
                }
            }
            catch (Exception e) {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return(errors);
        }

示例#24

0

显示文件

        /// <summary>
        /// Selects candidate nodes with an XPath query and, when a validation callback is supplied,
        /// keeps only the candidates it accepts.
        /// </summary>
        /// <param name="root">Node the query is evaluated against.</param>
        /// <param name="XQuery">XPath expression used to select the candidates.</param>
        /// <param name="ValidateFunction">Optional predicate; when null every candidate is returned.</param>
        /// <returns>
        /// The matching nodes, or <c>null</c> when <paramref name="root"/> is null, the query is
        /// null/empty, or the query selects nothing.
        /// </returns>
        public static List <HtmlNode> FilterNodes(HtmlNode root, string XQuery, ValidateNode ValidateFunction = null)
        {
            bool hasInput = root != null && !string.IsNullOrEmpty(XQuery);
            if (!hasInput)
            {
                return(null);
            }

            HtmlNodeCollection candidates = root.SelectNodes(XQuery);
            if (candidates == null)
            {
                return(null);
            }

            IEnumerable <HtmlNode> accepted = candidates;
            if (ValidateFunction != null)
            {
                accepted = candidates.Where(candidate => ValidateFunction(candidate));
            }

            return(accepted.ToList());
        }

示例#25

0

显示文件

 /// <summary>
 /// Extracts the first two integers from the pager cell of a listing page.
 /// </summary>
 /// <remarks>
 /// The pager text is looked up in a centered <c>td</c> with <c>colspan="12"</c>, falling back to
 /// <c>colspan="13"</c>; the first text node containing "共" is used.
 /// </remarks>
 /// <param name="doc">Parsed page; may be null.</param>
 /// <returns>
 /// A two element array holding the first and second number found in the pager text, or
 /// <c>null</c> when the document is null, the pager cell or text is missing, or the text
 /// contains fewer than two numbers.
 /// </returns>
 /// <exception cref="OverflowException">A number in the pager text does not fit into an Int32.</exception>
 public static int[] GetLinkPages(HtmlDocument doc)
 {
     if (doc == null)
     {
         return(null);
     }

     HtmlNodeCollection categoryList =
         doc.DocumentNode.SelectNodes("//td[@colspan=\"12\"][@align=\"center\"][1]/text()");
     if (categoryList == null)
     {
         categoryList = doc.DocumentNode.SelectNodes("//td[@colspan=\"13\"][@align=\"center\"][1]/text()");
     }
     if (categoryList == null)
     {
         return(null);
     }

     HtmlNode pagerText = categoryList.FirstOrDefault(x => x.InnerText.Contains("共"));
     if (pagerText == null)
     {
         return(null);
     }

     // Collapse every run of non-digits into a comma, then keep the non-empty digit groups.
     string   result = System.Text.RegularExpressions.Regex.Replace(pagerText.InnerText, @"[^0-9]+", ",");
     string[] args   = result.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

     // Indexing args[1] used to throw when the text carried fewer than two numbers.
     if (args.Length < 2)
     {
         return(null);
     }

     return(new[] { Convert.ToInt32(args[0]), Convert.ToInt32(args[1]) });
 }

示例#26

0

显示文件

        /// <summary>
        /// Validates the <c>input</c> elements of the document that have the given <c>type</c>.
        /// </summary>
        /// <remarks>
        /// Checks that at least <paramref name="min"/> such inputs exist; for "radio" and "checkbox"
        /// it additionally requires a single shared <c>name</c> and exactly one <c>checked</c> element.
        /// The result of <c>CheckLabels</c> for the matching inputs is appended whenever the document
        /// has any <c>input</c> element.
        /// </remarks>
        /// <param name="type">Value of the <c>type</c> attribute to look for (matched case-insensitively).</param>
        /// <param name="min">Minimum number of matching inputs required.</param>
        /// <returns>
        /// A list of error messages; exceptions are reported as an "EXCEPTION: ..." entry instead of
        /// being thrown.
        /// </returns>
        private List <string> CheckInputFields(string type, int min)
        {
            List <string> errors = new List <string>();

            try{
                HtmlNodeCollection nodes = this.HtmlDoc.DocumentNode.SelectNodes("//input");
                if (nodes == null)
                {
                    errors.Add(string.Format("Does not contains any {0} fields.", type));
                }
                else
                {
                    //TODO: get the nodes using XPath... I can't get the correct one, maybe a bug? //input[@type='text']
                    //TODO: solved in Css3Validator (method CheckCssProperty)
                    // HTML "type" values are case-insensitive, so TYPE="Radio" must match "radio".
                    List <HtmlNode> filtered = nodes
                                               .Where(x => x.GetAttributeValue("type", "").Equals(type, StringComparison.OrdinalIgnoreCase))
                                               .ToList();
                    if (filtered.Count < min)
                    {
                        errors.Add(string.Format("Does not contains enough {0} fields.", type));
                    }
                    else if (type == "radio" || type == "checkbox")
                    {
                        // More than one distinct name means the fields do not form a single group.
                        if (filtered.GroupBy(x => x.GetAttributeValue("name", "")).Count() > 1)
                        {
                            errors.Add(string.Format("The {0} fields does not share the same name.", type));
                        }
                        // Exactly one field must carry the "checked" attribute.
                        if (filtered.Count(x => x.Attributes.Any(y => y.Name == "checked")) != 1)
                        {
                            errors.Add(string.Format("The {0} fields does not have a single default value.", type));
                        }
                    }

                    errors.AddRange(CheckLabels(filtered, type));
                }
            }
            catch (Exception e) {
                errors.Add(string.Format("EXCEPTION: {0}", e.Message));
            }

            return(errors);
        }

示例#27

0

显示文件

        /// <summary>
        /// We use XPath expressions to extract all available links from the article, making sure that the links are taken only from the main content section where the article is present.
        /// </summary>
        /// <returns>
        /// One <c>Link</c> per distinct <c>/wiki/</c> href found below the main content node, excluding
        /// the File:, Template: and Special: namespaces; an empty list when no such anchor exists.
        /// </returns>
        public List <Link> GetLinks()
        {
            HtmlNode mainContent = WikipediaUtility.GetMainContent(website);

            // The leading "." keeps the query relative to mainContent. A bare "//a" is an absolute
            // path and would collect anchors from the whole document.
            HtmlNodeCollection links = mainContent.SelectNodes(".//a[starts-with(@href,'/wiki/')]");

            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (links == null)
            {
                return(new List <Link>());
            }

            string[] excludedPrefixes = { "/wiki/file:", "/wiki/template:", "/wiki/special:" };

            return(links
                   .Where(n => n.GetAttributeValue("href", null) != null)
                   .Select(n => n.Attributes["href"].Value)
                   .Distinct()
                   // Ordinal, case-insensitive prefix test: independent of the current culture.
                   .Where(urll => !excludedPrefixes.Any(
                              prefix => urll.StartsWith(prefix, System.StringComparison.OrdinalIgnoreCase)))
                   .Select(urll => new Link()
            {
                URL = urll
            })
                   .ToList());
        }

示例#28

0

显示文件

文件： MailingListScrapper.cs 项目： atifrehman/NDN-Mailing-List-Search-App

        /// <summary>
        /// Loads a monthly archive page and returns the absolute URLs of the entries it links to.
        /// </summary>
        /// <remarks>
        /// Entries are read from the <c>li</c> children of the second <c>ul</c> that is a direct child of
        /// <c>body</c>; nested lists are handed to <c>RecursiveFetchMontlyURL</c>. Each collected href is
        /// prefixed with <paramref name="monthURL"/> up to and including its last '/', and every
        /// resulting URL is written to the console.
        /// </remarks>
        /// <param name="monthURL">Address of the monthly archive page.</param>
        /// <returns>The absolute entry URLs; empty when the page has no body or fewer than two lists.</returns>
        public List <string> FetchMontlyURLs(string monthURL)
        {
            List <string> urls = new List <string>();

            HtmlWeb      web      = new HtmlWeb();
            HtmlDocument document = web.Load(monthURL);
            HtmlNode     body     = document.DocumentNode.SelectSingleNode("//body");

            // A page without a body element has nothing to collect.
            if (body == null)
            {
                return(urls);
            }

            List <HtmlNode> allUlList = body.ChildNodes.Where(x => x.Name == "ul").ToList();
            if (allUlList.Count > 1)
            {
                HtmlNode urlsULNode = allUlList[1]; // fetching ul

                // Snapshot the items so the recursion below cannot disturb this enumeration.
                foreach (var liItem in urlsULNode.ChildNodes.Where(x => x.Name == "li").ToList())
                {
                    urls.Add(liItem.FirstChild.Attributes["href"].Value);
                    if (liItem.ChildNodes.Any(x => x.Name == "ul"))
                    {
                        RecursiveFetchMontlyURL(liItem, urls);
                    }
                }
            }

            // construct full urls
            string baseUrl = monthURL.Substring(0, monthURL.LastIndexOf('/') + 1);
            for (int i = 0; i < urls.Count; i++)
            {
                urls[i] = baseUrl + urls[i];
                Console.WriteLine(urls[i]);
            }

            return(urls);
        }

示例#29

0

显示文件

文件： MiguMusicApi.cs 项目： Executor-Cheng/MiguMusic_DGJModule

        /// <summary>
        /// Downloads the playlist page for <paramref name="id"/> and scrapes one <c>SongInfo</c> per
        /// song row that carries a non-empty <c>data-cid</c> attribute.
        /// </summary>
        /// <param name="id">Playlist identifier inserted into the playlist URL.</param>
        /// <returns>
        /// The scraped songs; an empty array when the page contains no song rows. Missing singer or
        /// album markup yields an empty artist, a null album and an album id of 0 for that song.
        /// </returns>
        /// <exception cref="NotImplementedException">A <c>JsonReaderException</c> was raised while processing the page.</exception>
        public static SongInfo[] GetPlaylist(long id)
        {
            string html = HttpHelper.HttpGet($"http://music.migu.cn/v3/music/playlist/{id}", headers: DefaultHeaders);

            try
            {
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(html);
                HtmlNode           root = document.DocumentNode;
                HtmlNodeCollection list = root.SelectNodes("//div[@class='row J_CopySong']");

                // SelectNodes yields null when no row matches.
                if (list == null)
                {
                    return(Array.Empty <SongInfo>());
                }

                return(list
                       .Where(p => p.Attributes.Any(q => q.Name == "data-cid" && !string.IsNullOrEmpty(q.Value)))
                       .Select(p =>
                {
                    HtmlNode singers   = p.SelectSingleNode("./div[contains(@class,'song-singers')]");
                    HtmlNode albumLink = p.SelectSingleNode("./div[contains(@class,'song-belongs')]")?.SelectSingleNode("./a");

                    // The album id is the last path segment of the album link; anything that is
                    // missing or not a number falls back to 0.
                    string albumIdText = albumLink?.Attributes["href"]?.Value.Split('/').Last();
                    int    albumId;
                    if (!int.TryParse(albumIdText, out albumId))
                    {
                        albumId = 0;
                    }

                    return new SongInfo
                    {
                        CopyrightId = p.Attributes["data-cid"].Value,
                        Name = p.SelectSingleNode(".//a[contains(@class,'song-name-txt')]")?.InnerText,
                        Artist = string.Join(",", singers?.SelectNodes("./a")?.Select(q => q.InnerText) ?? Array.Empty <string>()),
                        Album = albumLink?.InnerText,
                        AlbumId = albumId
                    };
                }).ToArray());
            }
            catch (JsonReaderException)
            {
                throw new NotImplementedException("意外的服务器返回");
            }
        }

示例#30

0

显示文件

文件： Scrapper.cs 项目： jirkaceska/C_sharp

        /// <summary>
        /// Downloads the U Drevaka menu page, parses every "item-day" entry into a <c>DayMenu</c> and
        /// saves the result under <c>Restaurants.UDrevaka</c>.
        /// </summary>
        private void LoadUDrevaka()
        {
            // Converts one day entry into a DayMenu: date, optional soup (taken from the first row when
            // it starts with "Polévka:") and every row that has both a name cell and a price cell.
            DayMenu ParseDay(HtmlNode day)
            {
                const string soupPrefix = "Polévka:";
                const string nameXPath  = "./div[@class='col-sm-10 col-xs-9']";
                const string priceXPath = "./div[@class='col-sm-2 col-xs-3 special-menu-price']";

                string   dateStr = day.SelectSingleNode("./div[@class='menu-day']").InnerText;
                DateTime date    = Utils.ParseDateTime(dateStr);

                HtmlNodeCollection rows         = day.SelectNodes("./div[@class='row']");
                string             firstRowText = rows[0].SelectSingleNode("./div").InnerText;

                // StartsWith is safe for text shorter than the prefix; IndexOf(value, 0, 8) throws there.
                bool   hasSoup = firstRowText.StartsWith(soupPrefix, StringComparison.Ordinal);
                string soup    = null;
                if (hasSoup)
                {
                    // Drop the prefix plus the single character that follows it.
                    int soupStart = soupPrefix.Length + 1;
                    soup = firstRowText.Length > soupStart ? firstRowText.Substring(soupStart) : string.Empty;
                }

                Food[] foods = rows
                               // The soup row (when present) is the first row and is not a food.
                               .Skip(hasSoup ? 1 : 0)
                               .Select(row => new
                {
                    Name  = row.SelectSingleNode(nameXPath),
                    Price = row.SelectSingleNode(priceXPath)
                })
                               // Check if row is really food and not soup or note
                               .Where(cells => cells.Name != null && cells.Price != null)
                               .Select(cells => new Food(
                                           HtmlEntity.DeEntitize(Utils.RemoveLeadingNumbers(cells.Name.InnerText)),
                                           Utils.ParsePrice(cells.Price.InnerText)
                                           )).ToArray();
                return(new DayMenu(date, soup, foods));
            }

            HtmlNode           doc  = Utils.GetHtmlDoc(Constants.udrevakaUrl).DocumentNode;
            HtmlNode           menu = doc.SelectSingleNode("//ul[@class='special-menu pb-xlg']");
            HtmlNodeCollection days = menu.SelectNodes("./li[@class='item-day']");

            DayMenu[] dayMenus       = days.Select(ParseDay).ToArray();
            string    restaurantName = GetRestaurantName(doc);

            SaveRestaurant(restaurantName, dayMenus, Restaurants.UDrevaka);
        }

C# (CSharp) HtmlNodeCollection.Where示例