public Load ( string url ) : HtmlAgilityPack.HtmlDocument | ||
url | string | The requested URL, such as "http://Myserver/Mypath/Myfile.asp". |
return | HtmlAgilityPack.HtmlDocument |
/// <summary>
/// Downloads the BBC football results fragment and walks every table row that
/// carries an <c>id</c> attribute, reading match id, score, team names and date.
/// </summary>
/// <remarks>
/// The extracted values are held in locals only and are discarded when the loop
/// advances, exactly as in the previous version.
/// </remarks>
public datascraper()
{
    string url = @"http://www.bbc.co.uk/sport/football/results/partial/competition-118996114";
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument doc = htmlWeb.Load(url);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlNodeCollection mtchrslts = doc.DocumentNode.SelectNodes("//tr[@id]");
    if (mtchrslts == null)
    {
        return;
    }

    foreach (HtmlNode matchresult in mtchrslts)
    {
        // The row itself is the tr[@id] element, so its id is read directly.
        string idnum = matchresult.Id.Replace("match-row-", "");

        // ".//" keeps each lookup inside the current row; a leading "//" restarts
        // at the document root and would return the first match of the whole page.
        HtmlNode scoreNode = matchresult.SelectSingleNode(".//abbr[@title='Score']");
        HtmlNode homeNode = matchresult.SelectSingleNode(".//p[(@class='team-home teams')]");
        HtmlNode awayNode = matchresult.SelectSingleNode(".//p[(@class='team-away teams')]");
        HtmlNode dateNode = matchresult.SelectSingleNode(".//td[(@class='match-date')]");
        if (scoreNode == null || homeNode == null || awayNode == null || dateNode == null)
        {
            // Not a complete result row; skip it.
            continue;
        }

        string[] teamscores = scoreNode.InnerText.Split('-');
        if (teamscores.Length < 2)
        {
            continue;
        }

        string teamscoreh = teamscores[0];
        string teamscorea = teamscores[1];
        string hteam = homeNode.InnerText;
        string ateam = awayNode.InnerText;
        string date = dateNode.InnerText;
    }
}
/// <summary>
/// Loads the brand page for <c>mCarBrand</c>, fills in its logo URL, official
/// site, country and list URL, then loads the list page and appends one
/// <c>CarFactory</c> per factory node before calling <c>runFactoryTasks</c>.
/// </summary>
public void Run()
{
    HtmlWeb client = new HtmlWeb();
    HtmlDocument page = client.Load(WebConstants.BASE_URL + mCarBrand.Url);

    // Every fragment is re-parsed from its OuterHtml into a detached node before being queried.
    string logoHtml = page.DocumentNode.SelectSingleNode(WebConstants.BRAND_LOGO).OuterHtml;
    HtmlNode logo = HtmlNode.CreateNode(logoHtml);
    mCarBrand.LogoUrl = logo.SelectSingleNode(WebConstants.IMAGE_SRC).Attributes[WebConstants.SRC].Value;
    //new Thread(new BrandLogoDownloadTask(mCarBrand).Download).Start();

    string siteHtml = page.DocumentNode.SelectSingleNode(WebConstants.BRAND_OFFICIAL_SITE).OuterHtml;
    HtmlNode site = HtmlNode.CreateNode(siteHtml);
    mCarBrand.OfficialSite = site.SelectSingleNode(WebConstants.LINK_HREF).Attributes[WebConstants.HREF].Value;

    string countryHtml = page.DocumentNode.SelectSingleNode(WebConstants.BRAND_COUNTRY).OuterHtml;
    HtmlNode country = HtmlNode.CreateNode(countryHtml);
    // The country name is the node text with the leading EM element's text cut off.
    int labelLength = country.SelectSingleNode(WebConstants.EM).InnerText.Length;
    mCarBrand.Country = new Country(country.InnerText.Substring(labelLength));
    mCarBrand.Country.LogoUrl = country.SelectSingleNode(WebConstants.IMAGE_SRC).Attributes[WebConstants.SRC].Value;
    //new Thread(new CountryLogoDownloadTask(mCarBrand.Country).Download).Start();

    string listHtml = page.DocumentNode.SelectSingleNode(WebConstants.BRAND_LIST).OuterHtml;
    HtmlNode list = HtmlNode.CreateNode(listHtml);
    mCarBrand.ListUrl = list.SelectSingleNode(WebConstants.SCRIPT_SRC).Attributes[WebConstants.SRC].Value;

    page = client.Load(WebConstants.BASE_URL + mCarBrand.ListUrl);
    HtmlNodeCollection factories = page.DocumentNode.SelectNodes(WebConstants.FACTORY_NODE);
    if (factories != null)
    {
        foreach (HtmlNode rawFactory in factories)
        {
            HtmlNode detached = HtmlNode.CreateNode(rawFactory.OuterHtml);
            CarFactory factory = new CarFactory(mCarBrand)
            {
                Url = detached.SelectSingleNode(WebConstants.LINK_HREF).Attributes[WebConstants.HREF].Value,
                Name = detached.InnerText.Replace("/", "")
            };
            mCarBrand.CarFactoryList.Add(factory);
        }
    }

    runFactoryTasks();
}
/// <summary>
/// Rebuilds <c>Pages</c>: loads the story URL to determine the page count via
/// <c>GetMaxPage</c>, then loads "/page/N" for every N and stores the inner HTML
/// of the first <c>div#storyText</c> (or an empty string when there is none).
/// </summary>
public void GeneratePages()
{
    HtmlWeb client = new HtmlWeb();
    _htmlDoc = client.Load(this.Url);
    int lastPage = this.GetMaxPage(_htmlDoc);
    this.Pages = new List<WattpadPage>();

    int pageNumber = 1;
    while (pageNumber <= lastPage)
    {
        _htmlDoc = client.Load(this.Url + "/page/" + pageNumber);
        IEnumerable<HtmlNode> storyBlocks = _htmlDoc.DocumentNode.SelectNodes("//div[@id='storyText']");

        string content = storyBlocks == null ? "" : storyBlocks.First().InnerHtml;
        this.Pages.Add(new WattpadPage { PageNumber = pageNumber, Content = content });
        pageNumber++;
    }
}
/// <summary>
/// Crawls the bird list on vogelwarte.ch, loads every detail page, downloads the
/// gallery images into <c>IMAGES_FOLDER</c> and writes <c>data.js</c> containing
/// the birds, the distinct groups and the distinct habitats as JavaScript variables.
/// </summary>
static void Download()
{
    List<Vogel> voegel = new List<Vogel>();
    Directory.CreateDirectory(IMAGES_FOLDER);
    Uri baseUri = new Uri("http://www.vogelwarte.ch");
    HtmlWeb web = new HtmlWeb();
    Uri queryUri = new Uri(baseUri, "voegel-der-schweiz.html?keyword=&mode=name%2CnameL&showPage=0&length=0&lang=de&exampleSearch=0");
    Console.WriteLine(queryUri);
    var docQuery = web.Load(queryUri.ToString());

    // SelectNodes yields null when the list has no rows; in that case only the empty data file is written.
    var entryLinks = docQuery.DocumentNode.SelectNodes("//table[@class=\"list\"]/tr[@class=\"listEntry\"]/td/h3/a");
    if (entryLinks != null)
    {
        // One client is reused for all image downloads instead of one per image.
        using (var client = new WebClient())
        {
            foreach (var elEntry in entryLinks)
            {
                Uri uriEntry = new Uri(baseUri, Decode(elEntry.Attributes["href"].Value));
                Console.WriteLine(uriEntry);
                var docEntry = web.Load(uriEntry.ToString());
                var nodeDetail = docEntry.DocumentNode.SelectSingleNode("//div[@id=\"birdDetail\"]");

                // A bird without gallery images gets an empty array rather than crashing the crawl.
                var imageNodes = nodeDetail.SelectNodes("//div[@id=\"gallery\"]/div/img");

                Vogel vogel = new Vogel
                {
                    Name = Decode(elEntry.InnerText),
                    Gruppe = Decode(nodeDetail.SelectSingleNode("//td[strong/text()=\"Vogelgruppe:\"]").LastChild.InnerText),
                    Lebensraum = Decode(nodeDetail.SelectSingleNode("//td[strong/text()=\"Lebensraum:\"]").LastChild.InnerText),
                    Laenge = Decode(nodeDetail.SelectSingleNode("//td[strong/text()=\"Länge (cm):\"]").LastChild.InnerText),
                    Bilder = imageNodes == null
                        ? new Bild[0]
                        : imageNodes.Select(nodeImg => new Bild
                        {
                            Titel = Decode(nodeImg.Attributes["title"].Value),
                            Source = new Uri(baseUri, Decode(nodeImg.Attributes["src"].Value)).ToString()
                        }).ToArray()
                };
                voegel.Add(vogel);

                foreach (var bild in vogel.Bilder)
                {
                    Console.WriteLine(bild.Source);
                    string strFile = Path.GetFileName(bild.Source);
                    client.DownloadFile(bild.Source, IMAGES_FOLDER + "/" + strFile);
                    // From here on the image is referenced by its local file name.
                    bild.Source = strFile;
                }
            }
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    using (StreamWriter writer = File.CreateText("data.js"))
    {
        writer.Write("var Voegel = ");
        writer.Write(serializer.Serialize(voegel.OrderBy(v => v.Name)));
        writer.Write(";\r\nvar Gruppen = ");
        writer.Write(serializer.Serialize(voegel.Select(v => v.Gruppe).Distinct().OrderBy(g => g)));
        writer.Write(";\r\nvar Lebensraeume = ");
        writer.Write(serializer.Serialize(voegel.SelectMany(v => v.Lebensraum.Split(',').Select(l => l.Trim())).Distinct().OrderBy(l => l)));
        writer.Write(";");
    }
}
/// <summary>
/// Collects the image URL of every page of a mangareader.net chapter.
/// The first page is loaded synchronously; the remaining pages listed in the
/// <c>pageMenu</c> drop-down are loaded through <c>Observable.Start</c>.
/// </summary>
/// <param name="source">Not used by this implementation.</param>
/// <param name="link">URL of the first page of the chapter.</param>
/// <returns>A <c>ChapterData</c> whose <c>Images</c> holds one entry per page, in page order.</returns>
public static ChapterData getChapters(Source source, string link)
{
    ChapterData chapter = new ChapterData();
    var web = new HtmlAgilityPack.HtmlWeb();
    web.AutoDetectEncoding = true;
    var htmlpage1 = web.Load(link);

    var pages = new List<IObservable<HtmlDocument>>();
    pages.Add(Observable.Return(htmlpage1));

    // SelectNodes yields null when the page menu is missing; treat that as a single-page chapter.
    var linksToPages = htmlpage1.DocumentNode.SelectNodes(@"//*[@id='pageMenu']/option");
    int pageCount = linksToPages == null ? 0 : linksToPages.Count;

    // Index 0 is the page that has already been loaded.
    for (int i = 1; i < pageCount; i++)
    {
        var linkToPage = "http://www.mangareader.net" + linksToPages[i].GetAttributeValue("value", "");
        pages.Add(Observable.Start<HtmlDocument>(() =>
        {
            // Each worker uses its own HtmlWeb and returns its own document; sharing the
            // outer client and overwriting the captured first page raced across workers.
            var web2 = new HtmlAgilityPack.HtmlWeb();
            web2.AutoDetectEncoding = true;
            return web2.Load(linkToPage);
        }));
    }

    foreach (IObservable<HtmlDocument> item in pages)
    {
        HtmlDocument pagehtml = item.Wait();
        chapter.Images.Add(pagehtml.DocumentNode.SelectSingleNode(@"//*[@id='img']").GetAttributeValue("src", ""));
    }

    return chapter;
}
/// <summary>
/// Collects the image URL of every page of a chapter. The first page is loaded
/// synchronously; the remaining pages listed in the page selector are loaded
/// through <c>Observable.Start</c>.
/// </summary>
/// <param name="source">Not used by this implementation.</param>
/// <param name="link">URL of the first page of the chapter.</param>
/// <returns>A <c>ChapterData</c> whose <c>Images</c> holds one entry per page, in page order.</returns>
public static ChapterData getChapters(Source source, string link)
{
    ChapterData chapter = new ChapterData();
    var web = new HtmlAgilityPack.HtmlWeb();
    web.AutoDetectEncoding = true;
    var htmlpage1 = web.Load(link);

    var pages = new List<IObservable<HtmlDocument>>();
    pages.Add(Observable.Return(htmlpage1));

    // SelectNodes yields null when the selector is missing; treat that as a single-page chapter.
    var linksToPages = htmlpage1.DocumentNode.SelectNodes(@"/html/body/section[@class='readpage_top']/div[@class='go_page clearfix']/span[@class='right']/select[@class='wid60']/option");
    int pageCount = linksToPages == null ? 0 : linksToPages.Count;

    // Index 0 is the page that has already been loaded.
    for (int i = 1; i < pageCount; i++)
    {
        var linkToPage = linksToPages[i].GetAttributeValue("value", "");
        pages.Add(Observable.Start<HtmlDocument>(() =>
        {
            // Each worker uses its own HtmlWeb and returns its own document; sharing the
            // outer client and overwriting the captured first page raced across workers.
            var web2 = new HtmlAgilityPack.HtmlWeb();
            web2.AutoDetectEncoding = true;
            return web2.Load(linkToPage);
        }));
    }

    foreach (IObservable<HtmlDocument> item in pages)
    {
        HtmlDocument pagehtml = item.Wait();
        chapter.Images.Add(pagehtml.DocumentNode.SelectSingleNode(@"/html/body/section[@id='viewer']/a/img[@id='image']/@src").GetAttributeValue("src", ""));
    }

    return chapter;
}
/// <summary>
/// Walks the urfu.ru schedule pages: picks the eighth institute link, prints it,
/// then for each of its group links prints the group, loads the group's page
/// and constructs a <c>Schedule</c> from its tables.
/// </summary>
static void Main()
{
    string mainUrl = "http://urfu.ru/";
    string toSchedule = "student/schedule/schedule/list/institute/";
    string getInstitutes = "student/schedule/schedule/list/group/institute";
    string getGroups = "student/schedule/schedule/list/lesson/institute";

    var webGet = new HtmlAgilityPack.HtmlWeb();
    var doc = webGet.Load(mainUrl + toSchedule);
    var listInstituts = SelectAnchorsByHrefPrefix(doc, getInstitutes);

    // Only the eighth institute is processed; First() throws when fewer than eight links exist.
    var institut = listInstituts.Skip(7).First();
    Console.WriteLine(institut.InnerHtml);

    doc = webGet.Load(mainUrl + institut.GetAttributeValue("href", ""));
    var listGroups = SelectAnchorsByHrefPrefix(doc, getGroups);
    foreach (var group in listGroups)
    {
        Console.WriteLine(group.InnerHtml);
        doc = webGet.Load(mainUrl + group.GetAttributeValue("href", ""));
        var schedule = doc.DocumentNode.SelectNodes("//table");
        new Schedule(schedule);
    }
}

/// <summary>
/// Returns every anchor of <paramref name="doc"/> whose href starts with <paramref name="prefix"/>.
/// Anchors without an href are ignored and a page without anchors yields an empty list.
/// </summary>
private static List<HtmlAgilityPack.HtmlNode> SelectAnchorsByHrefPrefix(HtmlAgilityPack.HtmlDocument doc, string prefix)
{
    var matches = new List<HtmlAgilityPack.HtmlNode>();
    var anchors = doc.DocumentNode.SelectNodes("//a");
    if (anchors == null)
    {
        return matches;
    }

    foreach (var anchor in anchors)
    {
        // GetAttributeValue falls back to "" where Attributes["href"].Value would throw.
        if (anchor.GetAttributeValue("href", "").StartsWith(prefix, StringComparison.Ordinal))
        {
            matches.Add(anchor);
        }
    }

    return matches;
}
/// <summary>
/// Collects price data from the page at <c>naverlink</c> and from every further
/// result page linked in its paginator, passing each document to <c>collectdata</c>.
/// Web, URI-format and null-reference failures are reported on the console.
/// </summary>
public void storedata()
{
    // url variable
    try
    {
        // Collect the data of the first page.
        web = new HtmlAgilityPack.HtmlWeb();
        document = web.Load(naverlink);
        document3 = web.Load(naverlink);
        collectdata(document);

        // Collect the data of the remaining pages.
        int index3 = 0;
        int index4 = 0;
        int tmp2 = naverlink.IndexOf("=") + 1;
        int tmp3 = naverlink.IndexOf("&");
        int tmp4 = naverlink.IndexOf("query=") + 6;
        int tmp5 = naverlink.Length;
        // nvMid: text between the first '=' and the first '&'; query: everything after "query=".
        String nvMid = naverlink.Substring(tmp2, tmp3 - tmp2);
        String query = naverlink.Substring(tmp4, tmp5 - tmp4);
        String page;
        String url;

        var VARIABLES = document.DocumentNode.SelectSingleNode(".//div[@class='co_paginate']").Descendants().Where(x => x.Name == "a");
        foreach (var VARIABLE in VARIABLES)
        {
            // The page number is the first argument of the onclick handler: text between '(' and ','.
            page = VARIABLE.GetAttributeValue("onclick", "");
            index3 = page.IndexOf("(") + 1;
            index4 = page.IndexOf(",");
            page = page.Substring(index3, index4 - index3);

            url = "http://shopping.naver.com/detail/section_price_compare.nhn?nvMid=" + nvMid + "&pkey=0&pkey2=0&mallSeq=all&fee=all&page=" + page + "&frm=NVSHATC&query=" + query;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.Referer = "http://shopping.naver.com/detail/detail.nhn?nv_mid=9535864708&cat_id=50000151&frm=NVSHATC&query=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90+%EB%85%B8%ED%8A%B8%EB%B6%819+metal+NT900X3L-K58S";

            // Dispose the response and reader so the connection is released after every page.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                document3.LoadHtml(reader.ReadToEnd());
            }

            collectdata(document3);
        }
    }
    catch (WebException)
    {
        Console.WriteLine("네이버url 변수 WebException");
    }
    catch (HtmlWebException)
    {
        Console.WriteLine("네이버url 변수 WebException");
    }
    catch (UriFormatException)
    {
        Console.WriteLine("네이버url 변수 WebException");
    }
    catch (NullReferenceException)
    {
        Console.WriteLine("네이버url 변수 NullReferenceException");
    }
}
/// <summary>
/// Scrapes the novel with the given index from its main, staff and character
/// pages, fills a <c>Novel</c> through the <c>Parse*</c> helpers and saves it
/// via <c>NovelManager.SaveNovel</c>.
/// </summary>
/// <param name="index">Value substituted into the three URL patterns.</param>
public void ProcessIndex(int index)
{
    Novel novel = new Novel();
    HtmlWeb client = new HtmlWeb();

    // Main page: the first "mainbox" plus the optional releases and screenshots sections.
    HtmlDocument page = client.Load(String.Format(MainUrlPattern, index));
    HtmlNode[] boxes = page.DocumentNode.Descendants("div").Where(div => div.HasClass("mainbox")).ToArray();
    ParseMainContent(boxes[0], novel);

    HtmlNode releases = page.DocumentNode.Descendants("div").FirstOrDefault(div => div.HasClass("releases"));
    if (releases != null)
    {
        ParseReleasesContent(releases, novel);
    }

    HtmlNode screenshots = page.DocumentNode.Descendants("div").FirstOrDefault(div => div.HasId("screenshots"));
    if (screenshots != null)
    {
        ParseImagesContent(screenshots, novel);
    }

    // Staff page (extract artists): first "staff" div that passes NotContainsClass("cast").
    page = client.Load(String.Format(StaffPattern, index));
    HtmlNode staff = page.DocumentNode.Descendants("div").FirstOrDefault(div => div.HasClass("staff") && div.NotContainsClass("cast"));
    if (staff != null)
    {
        ParseStaffContent(staff, novel);
    }

    // Characters page: every "mainbox" after the first one is parsed as character content.
    page = client.Load(String.Format(CharacterPattern, index));
    boxes = page.DocumentNode.Descendants("div").Where(div => div.HasClass("mainbox")).ToArray();
    foreach (HtmlNode characterBox in boxes.Skip(1))
    {
        ParseCharactersContent(characterBox, novel);
    }

    using (var ctx = new VNContext("VNConnectionString"))
    {
        NovelManager.SaveNovel(novel, ctx);
        Logs.Debug($@"Novel {index} finished");
    }

    Console.WriteLine(index + @" finished");
}
/// <summary>
/// Downloads and parses the page at <paramref name="url"/>, sending a fixed Firefox user-agent string.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>The parsed document.</returns>
public HtmlDocument ReadLink(string url)
{
    var client = new HtmlAgilityPack.HtmlWeb
    {
        UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
    };
    return client.Load(url);
}
/// <summary>
/// Collects the <c>src</c> of every <c>img.chapter_img</c> on the "/full" view of a
/// chapter, prints them and hands them to <c>downloadFunctions.Download</c>.
/// </summary>
/// <param name="url">Chapter URL, e.g. https://www.comicextra.com/invincible-iron-man-2015/chapter-14; "/full" is appended when missing.</param>
/// <param name="path">Passed through to <c>downloadFunctions.Download</c>.</param>
public static void comicDown(string url, string path)
{
    // EndsWith also copes with URLs shorter than five characters, where Substring threw.
    if (!url.EndsWith("/full", StringComparison.Ordinal))
    {
        url = url + "/full";
    }

    HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = hw.Load(url);

    // SelectNodes yields null when the page has no chapter images.
    var images = doc.DocumentNode.SelectNodes("//img[@class='chapter_img']");
    if (images == null)
    {
        Console.WriteLine("No chapter images found at " + url);
        return;
    }

    List<string> linky = new List<string>();
    foreach (HtmlNode link in images)
    {
        linky.Add(link.GetAttributeValue("src", ""));
    }

    foreach (var item in linky)
    {
        Console.WriteLine(item);
    }

    // NOTE(review): the referer is fixed to one specific chapter rather than derived from url — confirm this is intended.
    string refer = "referer: https://www.comicextra.com/invincible-iron-man-2015/chapter-14/full";

    // The folder name is the last 15 characters of the URL (the whole URL when it is shorter).
    string FolderName = url.Length > 15 ? url.Substring(url.Length - 15) : url;
    downloadFunctions.Download(linky, path, refer, FolderName);
}
/// <summary>
/// Prompts for a search term, loads the search result page built from <c>baseUrl</c>,
/// copies book titles, author names and ratings into <c>bookList</c>,
/// <c>penulisList</c> and <c>ratingList</c>, then calls <c>showOutput</c>.
/// </summary>
static void Main(string[] args)
{
    Console.WriteLine("Silahkan masukkan jenis buku...");
    // ReadLine returns null at end of input; treat that as an empty search.
    string searchKey = Console.ReadLine() ?? string.Empty;
    Console.WriteLine("Sedang mencari informasi buku...");

    // Percent-encode the user input so characters such as '&' or '#' cannot alter
    // the query string; spaces are still sent as '+', as before.
    string encodedKey = Uri.EscapeDataString(searchKey).Replace("%20", "+");
    baseUrl = baseUrl.Replace("query=", "query=" + encodedKey).Replace(" ", "+");

    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(baseUrl);

    // SelectNodes yields null when a search has no hits; each list is then left unchanged.
    var bookName = doc.DocumentNode.SelectNodes("//a[@class='bookTitle']//span[@itemprop='name']");
    var penulis = doc.DocumentNode.SelectNodes("//a[@class='authorName']//span[@itemprop='name']");
    var rating = doc.DocumentNode.SelectNodes("//span[@class='minirating']");

    if (bookName != null)
    {
        foreach (var book in bookName)
        {
            bookList.Add(book.InnerText);
        }
    }

    if (penulis != null)
    {
        foreach (var writter in penulis)
        {
            penulisList.Add(writter.InnerText);
        }
    }

    if (rating != null)
    {
        foreach (var rate in rating)
        {
            ratingList.Add(rate.InnerText);
        }
    }

    showOutput();
    Console.ReadLine();
}
/// <summary>
/// Loads the coincost.net currency page and fills <c>listBox1</c> with every
/// second scraped price (first four characters removed), a separator line, and
/// every second scraped name.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // Before changing the XPath expressions, inspect the page in a browser and use
    // Ctrl+F to locate the elements that are needed.
    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load("https://coincost.net/es/currencies");

    // SelectNodes yields null when nothing matches; guard so a layout change
    // does not crash the click handler.
    var priceNodes = doc.DocumentNode.SelectNodes("//td [@class = 'price'] //p");
    if (priceNodes != null)
    {
        foreach (var item in priceNodes)
        {
            preciosMalo.Add(item.InnerText);
        }
    }

    // Only the even-indexed price entries are kept.
    for (int n = 0; n < preciosMalo.Count; n = n + 2)
    {
        string usd = preciosMalo[n];
        string numero = usd.Substring(4);
        precios.Add(numero);
        listBox1.Items.Add(numero);
    }

    listBox1.Items.Add("-------------------------------------");

    var nameNodes = doc.DocumentNode.SelectNodes("//td [@class = 'title'] //span");
    if (nameNodes != null)
    {
        foreach (var item in nameNodes)
        {
            nombresMalo.Add(item.InnerText);
        }
    }

    // Only the odd-indexed name entries are kept.
    for (int d = 1; d < nombresMalo.Count; d = d + 2)
    {
        nombres.Add(nombresMalo[d]);
        listBox1.Items.Add(nombresMalo[d]);
    }

    // To find the last value use "//td[@class = 'right tar']"
    // To find the symbol use "//b"
}
/// <summary>
/// Returns the lines of <paramref name="Filename"/> when that file exists.
/// Otherwise scrapes one Wikipedia "Index of Windows games" page per entry of
/// <c>GetAlphabet()</c>, writes the collected texts to the file and returns them.
/// </summary>
/// <param name="Filename">Path of the cache file.</param>
/// <returns>The cached or freshly scraped lines.</returns>
private static string[] PrepareTestData(string Filename)
{
    // A cached copy wins over downloading again.
    if (File.Exists(Filename))
    {
        return File.ReadAllLines(Filename);
    }

    Console.WriteLine("Preparing test data - reading...");
    List<String> titles = new List<string>();
    foreach (var letter in GetAlphabet())
    {
        Console.Write(letter);
        string pageUrl = String.Format("http://en.wikipedia.org/wiki/Index_of_Windows_games_({0})", letter);
        HtmlAgilityPack.HtmlWeb client = new HtmlAgilityPack.HtmlWeb();
        HtmlDocument page = client.Load(pageUrl);
        var texts = page.DocumentNode.SelectNodes(GetAddress(letter)).Select(node => node.InnerText);
        titles.AddRange(texts);
    }

    File.WriteAllLines(Filename, titles);
    Console.WriteLine("Done!");
    return titles.ToArray();
}
/// <summary>
/// Loads a GitHub Jobs result page and appends one <c>JobListing</c> per title
/// cell of the <c>positionlist</c> table to <paramref name="jobListings"/>.
/// Cells that do not have the expected structure are skipped.
/// </summary>
/// <param name="url">Address of the result page.</param>
/// <param name="jobListings">List that receives the scraped listings.</param>
private void AddGithubJobs(string url, List<JobListing> jobListings)
{
    HtmlWeb page = new HtmlWeb();
    var document = page.Load(url);
    string baseURL = "https://jobs.github.com";

    // Explicit checks replace the former blanket catch, which hid every error and
    // dropped all remaining rows as soon as one row was malformed.
    HtmlNode table = document.DocumentNode.SelectSingleNode("//table[@class='positionlist']");
    if (table == null)
    {
        return;
    }

    HtmlNodeCollection rows = table.SelectNodes(".//td[@class='title']");
    if (rows == null)
    {
        return;
    }

    foreach (HtmlNode row in rows)
    {
        if (row.ChildNodes.Count != 5)
        {
            continue;
        }

        HtmlNode titleNode = row.ChildNodes[1];
        HtmlNode sourceNode = row.ChildNodes[3];
        if (titleNode.ChildNodes.Count == 0 || sourceNode.ChildNodes.Count < 2)
        {
            continue;
        }

        // The link target is the first attribute of the title node's first child.
        HtmlNode link = titleNode.ChildNodes[0];
        if (link.Attributes.Count == 0)
        {
            continue;
        }

        jobListings.Add(new JobListing()
        {
            SearchEngine = SearchEngines.GitHub,
            Title = titleNode.InnerText,
            Company = sourceNode.ChildNodes[1].InnerText,
            URL = baseURL + link.Attributes[0].Value
        });
    }
}
/// <summary>
/// When <c>updateCoworkerWarningBoolean()</c> returns true, reads the first-column
/// cells of the employee table, adds every seventh text to <c>names</c> and calls
/// <c>setCoworkerstoFile</c>; on a <c>WebException</c> it calls
/// <c>getCoworkersFromFile</c> instead. <c>initDDL</c> runs afterwards in both cases.
/// </summary>
public void getCoworkerNames()
{
    if (!updateCoworkerWarningBoolean())
    {
        return;
    }

    try
    {
        HtmlAgilityPack.HtmlWeb client = new HtmlAgilityPack.HtmlWeb();
        HtmlAgilityPack.HtmlDocument page = client.Load("http://10.45.10.149/brdkServices/EmployeeDB/");
        var cellTexts = page.DocumentNode
            .SelectNodes("//*[@id=\"bootstrap-override\"]/div[1]/div/table/tbody//tr/td[1]")
            .Select(cell => cell.InnerText)
            .ToList();

        // Indices 0, 7, 14, ... are taken.
        for (int index = 0; index < cellTexts.Count; index += 7)
        {
            names.Add(cellTexts[index]);
        }

        setCoworkerstoFile();
    }
    catch (System.Net.WebException)
    {
        getCoworkersFromFile();
    }

    initDDL();
}
/// <summary>
/// Loads <c>source.Domain + source.Path</c> and returns the non-empty <c>href</c>
/// of every element matched by <c>source.LinkSelector</c>. Hrefs starting with
/// "/" are prefixed with the domain (trailing slashes trimmed).
/// </summary>
/// <param name="source">Supplies the domain, path and link selector.</param>
/// <returns>The collected URLs in document order.</returns>
public List<String> ReviewUrl(Source source)
{
    HtmlAgilityPack.HtmlWeb client = new HtmlAgilityPack.HtmlWeb();
    var root = client.Load(source.Domain + source.Path).DocumentNode;

    List<String> collected = new List<string>();
    foreach (var anchor in root.QuerySelectorAll(source.LinkSelector))
    {
        try
        {
            string href = anchor.GetAttributeValue("href", "");
            Debug.WriteLine(href);
            if (string.IsNullOrEmpty(href))
            {
                continue;
            }

            if (href.StartsWith("/"))
            {
                href = source.Domain.TrimEnd('/') + href;
            }

            collected.Add(href);
        }
        catch (Exception ex)
        {
            // A failure on one element is traced and does not stop the scan.
            Debug.WriteLine(ex);
        }
    }

    return collected;
}
/// <summary>
/// Loads the page at <paramref name="webAddress"/> and joins the <c>InnerText</c>
/// of every descendant node with single spaces.
/// </summary>
/// <param name="webAddress">Address of the page to load.</param>
/// <returns>The space-separated texts; each descendant contributes its own <c>InnerText</c>.</returns>
public static string getContent(string webAddress)
{
    HtmlAgilityPack.HtmlWeb client = new HtmlAgilityPack.HtmlWeb();
    HtmlDocument page = client.Load(webAddress);
    var texts = from node in page.DocumentNode.Descendants()
                select node.InnerText;
    return string.Join(" ", texts);
}
/// <summary>
/// Downloads and parses the timetable page for a study year and optional half year.
/// </summary>
/// <param name="year">Study year; its enum name forms the first part of the page name.</param>
/// <param name="halfYear">Half year; <c>HalfYear.None</c> contributes nothing to the page name.</param>
/// <returns>
/// The parsed timetable, or <c>null</c> when loading fails with a
/// <c>WebException</c> or <c>NotSupportedException</c> (both are logged).
/// </returns>
public List<TimetableItem> GetTimetableForYear(StudyYear year, HalfYear halfYear = HalfYear.None)
{
    List<TimetableItem> timetable;
    string tempYear = Enum.GetName(typeof(StudyYear), year);
    string tempHalfYear = Enum.GetName(typeof(HalfYear), halfYear);
    if (tempHalfYear == "None")
    {
        tempHalfYear = String.Empty;
    }

    try
    {
        HtmlWeb hw = new HtmlWeb();
        // Use the normalised half-year text: formatting the raw enum produced
        // "orar_<year>None.html" whenever no half year was given.
        HtmlDocument doc = hw.Load(String.Format("http://thor.info.uaic.ro/~orar/participanti/orar_{0}{1}.html", tempYear, tempHalfYear));
        // Line breaks are stripped from the markup before the table is parsed.
        doc.DocumentNode.InnerHtml = doc.DocumentNode.InnerHtml.Replace("\r\n", "");
        timetable = ParseTable(doc, TimetableType.Year);
    }
    catch (WebException ex)
    {
        Logger.ExceptionLogger.Log(ex);
        timetable = null;
    }
    catch (NotSupportedException ex)
    {
        Logger.ExceptionLogger.Log(ex);
        timetable = null;
    }

    return timetable;
}
//Public Methods
/// <summary>
/// Loads <paramref name="url"/>, prints the <c>href</c> of every anchor and adds
/// those that are absolute URIs to <c>_results</c>. A failure to load or parse
/// the page is reported on standard error and otherwise ignored.
/// </summary>
/// <param name="url">Address of the page to scan.</param>
public void Scrape(string url)
{
    try
    {
        HtmlWeb hw = new HtmlWeb();
        HtmlDocument doc = hw.Load(url);

        // SelectNodes yields null when the page has no anchors with an href.
        HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//a[@href]");
        if (links == null)
        {
            return;
        }

        foreach (HtmlNode link in links)
        {
            HtmlAttribute att = link.Attributes["href"];
            Console.WriteLine(att.Value);

            // TryCreate replaces the former try/empty-catch: values that are not
            // absolute URIs are skipped without throwing.
            Uri target;
            if (Uri.TryCreate(att.Value, UriKind.Absolute, out target))
            {
                this._results.Add(target);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: the scan never throws to the caller, but the failure is no longer silent.
        Console.Error.WriteLine("Scrape failed for " + url + ": " + ex.Message);
    }
}
/// <summary>
/// Loads the news listing (with "?wpage=N" appended for pages after the first)
/// and returns one <c>NewsObject</c> per <c>div.recentNews</c> block, built from
/// the link and text of its <c>h2.subHeading</c>.
/// </summary>
/// <param name="page">Page number; values of 1 or less load <c>RootUrl</c> itself.</param>
/// <returns>The news items in document order.</returns>
public override List<NewsObject> NewestNews(int page)
{
    string htmlUrl = page > 1 ? RootUrl + "?wpage=" + page : RootUrl;

    HtmlWeb htmlWeb = new HtmlWeb();
    htmlWeb.AutoDetectEncoding = false;
    // Force UTF-8 so Vietnamese text is displayed correctly.
    htmlWeb.OverrideEncoding = Encoding.UTF8;
    HtmlDocument document = htmlWeb.Load(htmlUrl);

    List<NewsObject> results = new List<NewsObject>();
    foreach (var block in document.DocumentNode.QuerySelectorAll("div.recentNews").ToList())
    {
        var heading = block.QuerySelector("h2.subHeading");
        string href = heading.QuerySelector("a").Attributes["href"].Value;
        results.Add(new NewsObject
        {
            Link = RootUrl + href,
            Text = TrimHtml(heading.InnerText)
        });
    }

    return results;
}
/// <summary>
/// Loads the page at <paramref name="url"/> and returns the <c>href</c> of every
/// <c>a</c> element that has an href attribute and non-blank inner text.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>The href values in document order.</returns>
public List<string> GetHrefLinks(string url)
{
    // Get a page from remote server
    HtmlWeb client = new HtmlWeb();
    var page = client.Load(url);

    // For now only the URL of each link is returned, not its text.
    return page.DocumentNode.Descendants()
        .Where(node => node.Name == "a"
                       && node.Attributes["href"] != null
                       && node.InnerText.Trim().Length > 0)
        .Select(node => node.Attributes["href"].Value)
        .ToList();
}
/// <summary>
/// Loads the page whose address is in <c>TextBox1</c> and writes the header and
/// description of every <c>article-single</c> element to the response, one small
/// table per article. Errors are written to the response as encoded text.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    try
    {
        // NOTE(review): the URL comes straight from user input, so the server will fetch
        // whatever address is entered — confirm this is acceptable for the deployment.
        HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
        HtmlAgilityPack.HtmlDocument doc = web.Load(TextBox1.Text);

        // SelectNodes yields null when the page has no matching articles.
        var Articles = doc.DocumentNode.SelectNodes("//*[@class ='article-single']");
        if (Articles == null)
        {
            return;
        }

        foreach (var article in Articles)
        {
            var headerNode = article.SelectSingleNode(".//li[@class='article-header']");
            var copyNode = article.SelectSingleNode(".//li[@class='article-copy']");
            if (headerNode == null || copyNode == null)
            {
                continue;
            }

            // Scraped text is untrusted: normalise entities, then encode again before
            // it is written into this page so it cannot inject markup or script.
            var header = HttpUtility.HtmlEncode(HttpUtility.HtmlDecode(headerNode.InnerText));
            var description = HttpUtility.HtmlEncode(HttpUtility.HtmlDecode(copyNode.InnerText));

            Response.Write("<table><tr><td>");
            Response.Write("Name - " + header);
            Response.Write("<br />");
            Response.Write(" Description - " + description);
            Response.Write("</td></tr></table>");
        }
    }
    catch (Exception ex)
    {
        Response.Write(HttpUtility.HtmlEncode(ex.Message));
    }
}
/// <summary>
/// Queries Google for <paramref name="phrase"/> and returns the first number
/// found in the <c>resultStats</c> element.
/// </summary>
/// <param name="phrase">Search phrase; it is URL-encoded before being sent.</param>
/// <returns>
/// The hit count; 0 when no whole number can be read from the element; -1 when the
/// page has parse errors or no <c>resultStats</c> element.
/// </returns>
long NumOfHits(string phrase)
{
    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
    // Encode the phrase so spaces, '&', '#', '+' and similar reach Google as part of the query.
    string url = "https://www.google.com/search?q=" + System.Uri.EscapeDataString(phrase ?? string.Empty);
    HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(url);

    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        // Handle any parse errors as required
        System.Console.WriteLine("error");
        debug.Print("error\n");
        return -1;
    }

    if (htmlDoc.DocumentNode == null)
    {
        return -1;
    }

    HtmlAgilityPack.HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='resultStats']");
    if (node == null)
    {
        return -1;
    }

    Regex re = new Regex(@"[1-9](?:\d{0,2})(?:,\d{3})*(?:\.\d*[1-9])?|0?\.\d*[1-9]|0");
    String result = re.Match(node.InnerHtml).Value;

    // Parse every match, not only those with a thousands separator: previously any
    // count below 1,000 was reported as 0.
    long hits;
    bool parsed = long.TryParse(
        result.Replace(",", ""),
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture,
        out hits);
    return parsed ? hits : 0;
}
/// <summary>
/// Loads <paramref name="url"/> and returns every <c>tr</c> element whose class is <c>athing</c>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>The matching rows; an empty list when the page contains none.</returns>
static List<HtmlNode> ExtractPostsFromUrl(String url)
{
    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(url);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var rows = doc.DocumentNode.SelectNodes("//tr[@class='athing']");
    return rows == null ? new List<HtmlNode>() : new List<HtmlNode>(rows);
}
/// <summary>
/// Loads the page at <paramref name="url"/> and lists the name/content pairs of its meta tags.
/// </summary>
/// <param name="url">Address of the page; the bare value "http://" is replaced by "http://www.microsoft.com".</param>
/// <returns>
/// For every meta tag that has both attributes, a "Name=..." entry followed by a "Content=..." entry.
/// </returns>
public List<string> GetData(string url)
{
    if (url == "http://")
    {
        url = "http://www.microsoft.com";
    }

    // Get a page from remote server
    HtmlWeb client = new HtmlWeb();
    var page = client.Load(url);

    List<string> output = new List<string>();
    var metaTags = page.DocumentNode.SelectNodes("//meta");
    if (metaTags == null)
    {
        return output;
    }

    foreach (var tag in metaTags)
    {
        var nameAttribute = tag.Attributes["name"];
        var contentAttribute = tag.Attributes["content"];
        if (nameAttribute == null || contentAttribute == null)
        {
            continue;
        }

        output.Add("Name=" + nameAttribute.Value);
        output.Add("Content=" + contentAttribute.Value);
    }

    return output;
}
//query gametracker by map
/// <summary>
/// Searches gametracker.com for Dota 2 servers by map and appends the trimmed
/// text of the result cells to <paramref name="list"/>.
/// </summary>
/// <param name="list">List that receives the cell texts; it is also the return value.</param>
/// <param name="map">Map name; trimmed and URL-encoded before being sent.</param>
/// <returns>The same <paramref name="list"/> instance.</returns>
public static List<string> GetServersFromMap(List<string> list, string map)
{
    HtmlWeb htmlWeb = new HtmlWeb();
    // Creates an HtmlDocument object from an URL
    string url = "http://www.gametracker.com/search/dota2/?search_by=map&query="
                 + Uri.EscapeDataString(map.Trim())
                 + "&searchipp=50";
    HtmlAgilityPack.HtmlDocument document = htmlWeb.Load(url);

    var tables = document.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return list;
    }

    // Cells are collected after a cell containing "Server Map " has been seen, while the
    // "stopped" flag is off; every cell containing "Rank&darr" toggles that flag and
    // resets "started".
    bool started = false;
    bool stopped = true;

    foreach (HtmlNode table in tables)
    {
        // SelectNodes yields null for a table without direct tr children or a row
        // without td cells; those are skipped instead of throwing.
        var rows = table.SelectNodes("tr");
        if (rows == null)
        {
            continue;
        }

        foreach (HtmlNode row in rows)
        {
            var cells = row.SelectNodes("td");
            if (cells == null)
            {
                continue;
            }

            foreach (HtmlNode cell in cells)
            {
                string cellText = cell.InnerText;
                if (cellText.Contains("Rank&darr"))
                {
                    stopped = !stopped;
                    started = false;
                }

                if (started && !stopped)
                {
                    list.Add(cellText.Trim());
                }

                if (cellText.Contains("Server Map "))
                {
                    started = true;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Looks up a licence plate on nummerplade.net and copies make, model, variant,
/// chassis number, model year and plate from the result cells into a <c>Bilinformation</c>.
/// </summary>
/// <param name="nummerplade">Licence plate appended to the query URL.</param>
/// <returns>The populated vehicle information.</returns>
/// <exception cref="IngenBilinformationException">Any failure while loading or reading the page, with the cause as inner exception.</exception>
public static Bilinformation HentBilinformation(string nummerplade)
{
    try
    {
        Bilinformation result = new Bilinformation();
        HtmlWeb client = new HtmlWeb();
        HtmlDocument page = client.Load("http://www.nummerplade.net/soeg/?regnr=" + nummerplade);

        HtmlNode root = page.DocumentNode;
        if (root == null)
        {
            return result;
        }

        // Reads the text of the table cell with the given id.
        Func<string, string> cellText = id => root.SelectSingleNode("//td[@id='" + id + "']").InnerText;

        result.Maerke = cellText("maerke");
        result.Model = cellText("model");
        result.Variant = cellText("variant");
        result.Stelnummer = cellText("stelnr");
        result.Aargang = cellText("model_aar");
        result.Nummerplade = cellText("regnr");
        return result;
    }
    catch (Exception ex)
    {
        throw new IngenBilinformationException("Der blev ikke fundet nogen bilinformation på nummerpladen.", ex);
    }
}
/// <summary>
/// Loads one zhaopin.com search result page (city, keyword and page number come
/// from <c>_pars</c>) and appends a <c>JobInfo</c> for every result table after
/// the first to <c>_jobList</c>. Any exception is logged through <c>LogSave.ErrLogSave</c>.
/// </summary>
public void GetJobListFromWeb()
{
    try
    {
        var htmlWeb = new HtmlWeb { OverrideEncoding = Encoding.GetEncoding("UTF-8") };
        HtmlDocument htmlDoc = htmlWeb.Load(string.Format("http://sou.zhaopin.com/jobs/searchresult.ashx?jl={0}&kw={1}&p={2}", DataClass.GetDic_zhilian(_pars.Addr), _pars.Key, _pars.Page));

        // Keep document order: the loop below skips index 0, which only works when the
        // first table stays first. The former AsParallel().ToList() gave no ordering guarantee.
        // A missing result list (null) still ends up in the catch block and is logged.
        var nodeList = htmlDoc.DocumentNode.SelectNodes("//*[@id='newlist_list_content_table']/table[@class='newlist']");

        for (int i = 1; i < nodeList.Count; i++)
        {
            var node = nodeList[i];
            // Title text and detail URL come from the same anchor.
            var titleLink = node.SelectSingleNode(".//tr/td[@class='zwmc']/div/a");

            var job = new JobInfo();
            job.TitleName = titleLink.InnerText;
            job.InfoUrl = titleLink.Attributes["href"].Value;
            job.Company = node.SelectSingleNode(".//tr/td[@class='gsmc']/a").InnerText;
            job.Salary = node.SelectSingleNode(".//tr/td[@class='zwyx']").InnerText;
            job.City = node.SelectSingleNode(".//tr/td[@class='gzdd']").InnerText;
            job.Date = node.SelectSingleNode(".//tr/td[@class='gxsj']/span").InnerText;
            job.Source = "智联招聘";
            job.Method = "月薪";
            _jobList.Add(job);
        }
    }
    catch (Exception ex)
    {
        LogSave.ErrLogSave("错误【解析】", ex);
    }
}
//find out all the plumber information in a city
/// <summary>
/// Loads the listing page of a city, finds every link whose text is "Phone" and
/// prints and writes name, address, city and phone number of the matching entry
/// as one comma-separated line to <c>file</c>.
/// </summary>
/// <param name="state">State segment of the URL.</param>
/// <param name="city">City segment of the URL; "-plumbers" is removed when written to the file.</param>
private void ExtractCity(string state, string city)
{
    HtmlWeb client = new HtmlWeb();
    string cityUrl = RootUrl + @"/" + state + @"/" + city + @"?" + @"page=1&ipp=All";
    HtmlDocument page = client.Load(cityUrl);

    var anchors = page.DocumentNode.Descendants()
        .Where(node => node.Name == "a"
                       && node.Attributes["href"] != null
                       && node.InnerText.Trim().Length > 0);

    foreach (var anchor in anchors)
    {
        if (anchor.InnerText != "Phone")
        {
            continue;
        }

        string phone = anchor.ParentNode.NextSibling.InnerText;
        Console.WriteLine();
        Console.WriteLine("phone: " + phone);

        // Name and address are the second and third text lines of the second child
        // of the anchor's great-great-grandparent.
        string[] lines = anchor.ParentNode.ParentNode.ParentNode.ParentNode.FirstChild.NextSibling.InnerText.Split('\n');
        string name = lines[1].Trim();
        string address = lines[2].Trim();
        Console.WriteLine("name: " + name);
        Console.WriteLine("address: " + address);

        file.WriteLine(name + "," + address + "," + city.Replace("-plumbers", "") + "," + phone.Replace(" ", ""));
    }
}
/// <summary>
/// Returns the BibTeX text shown on a CiteSeer "viewdoc" page, with a line break
/// inserted after every comma.
/// </summary>
/// <param name="url">Page address; addresses that do not contain "viewdoc" are not loaded.</param>
/// <returns>The BibTeX text, or an empty string when the URL is not a viewdoc page or no entry is found.</returns>
public string getBibTex(string url)
{
    string res = "";

    //e.g. http://citeseer.ist.psu.edu/showciting?cid=2131272
    if (!url.Contains("viewdoc"))
    {
        return res;
    }

    //e.g. http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.3487
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);
    Console.WriteLine(doc != null ? "Document Loaded!" : "Load Error!");

    string temp = "";
    try
    {
        HtmlNode n = doc.DocumentNode.SelectSingleNode("//*[@id=\"bibtex\"]/p");
        if (n != null)
        {
            // NOTE(review): the second Replace looks like a no-op as written; it may have been
            // meant to normalise a non-breaking space or "&nbsp;" — confirm against the original file.
            temp = n.InnerText.Replace(",", ",\n").Replace(" ", " ");
        }
    }
    catch (Exception)
    {
        // Any failure while reading the entry leaves the result empty.
    }

    res = temp;
    return res;
}
// return remote page title from URI
/// <summary>
/// Returns the title of a remote page: the HTML-decoded Open Graph title when
/// there is one, otherwise the text of the first <c>title</c> element.
/// </summary>
/// <param name="remoteUri">Address of the page.</param>
/// <returns>The title, or <c>null</c> when none is found or any error occurs.</returns>
public static string GetTitleFromUri(string @remoteUri)
{
    try
    {
        // try using Open Graph to get target page title
        var graph = OpenGraph.ParseUrl(@remoteUri, "Voat.co OpenGraph Parser");
        if (!string.IsNullOrEmpty(graph.Title))
        {
            return HttpUtility.HtmlDecode(graph.Title);
        }

        // Open Graph parsing failed, try getting HTML TITLE tag instead
        HtmlWeb htmlWeb = new HtmlWeb();
        HtmlDocument htmlDocument = htmlWeb.Load(@remoteUri);
        if (htmlDocument != null)
        {
            // Take the first title element: SingleOrDefault threw whenever a page held more
            // than one (for example inside inline SVG), which turned a usable title into null.
            var titleNode = htmlDocument.DocumentNode.Descendants("title").FirstOrDefault();
            if (titleNode != null)
            {
                return titleNode.InnerText;
            }
        }

        return null;
    }
    catch (Exception)
    {
        // Best effort: callers receive null on any failure.
        return null;
    }
}
/// <summary>
/// Sets up the main layout, appends the tag-stripped content of every table cell
/// of the route page to the status text view and wires the button to open <c>Page2</c>.
/// </summary>
/// <param name="savedInstanceState">Passed through to the base implementation.</param>
protected override void OnCreate(Bundle savedInstanceState)
{
    base.OnCreate(savedInstanceState);
    SetContentView(Resource.Layout.Main);
    TextView textView = FindViewById<TextView>(Resource.Id.TEXT_STATUS_ID);

    // NOTE(review): the page is downloaded synchronously inside OnCreate — confirm that
    // blocking here is acceptable, or move the load off this thread.
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load("https://www.ltd.org/system-map/route_79x/");

    // Build the text once instead of re-assigning the view's Text for every cell.
    var builder = new System.Text.StringBuilder(textView.Text);

    // SelectNodes yields null when the page contains no table cells.
    HtmlNodeCollection tags = doc.DocumentNode.SelectNodes("//td");
    if (tags != null)
    {
        foreach (HtmlNode item in tags)
        {
            builder.Append(item.InnerHtml).Append("\n");
        }
    }

    // Strip any markup that was inside the cells.
    textView.Text = Regex.Replace(builder.ToString(), @"<[^>]*>", String.Empty);

    Button button = FindViewById<Button>(Resource.Id.myButton);
    button.Click += delegate
    {
        StartActivity(typeof(Page2));
    };
}
/// <summary>
/// Loads a fixed Yandex search URL as UTF-8 and prints the inner HTML of every
/// <c>div</c>, or "found nothing" when the page has none.
/// </summary>
public static void GetText2()
{
    string address = "https://yandex.by/search/?numdoc=10&p=0&rdrnd=601861&text=kinogo.co%20Один%20дома%201990%20&lr=157";

    HtmlWeb client = new HtmlWeb();
    client.AutoDetectEncoding = false;
    client.OverrideEncoding = Encoding.UTF8; //GetEncoding("windows-1251")

    HtmlDocument page = client.Load(address);
    HtmlNodeCollection divs = page.DocumentNode.SelectNodes("//div");

    // TODO: still needs finishing.
    if (divs == null)
    {
        Console.WriteLine("found nothing");
        return;
    }

    foreach (HtmlNode div in divs)
    {
        Console.WriteLine(div.InnerHtml);
    }
}
/// <summary>
/// Runs a Qwant image search and adds one <c>SearchResults</c> (thumbnail URL and
/// full image URL) per usable <c>img</c> element to <c>searchlist</c>, then
/// publishes it through <c>SearchList</c> and <c>ObservableList</c>.
/// </summary>
/// <param name="gametitle">Not used by this implementation.</param>
/// <param name="imagetype">Not used by this implementation.</param>
/// <param name="searchstring">Search text; spaces are sent as "%20".</param>
public void SearchLinks(string gametitle, string imagetype, string searchstring)
{
    searchstring = searchstring.Replace(" ", "%20");
    var url = "https://www.qwant.com/?q=" + searchstring + "&t=images";
    try
    {
        HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
        HtmlAgilityPack.HtmlDocument doc = hw.Load(url);

        // SelectNodes yields null when the page has no images.
        var images = doc.DocumentNode.SelectNodes("//img");
        if (images != null)
        {
            foreach (HtmlNode link in images)
            {
                string imgValue = link.GetAttributeValue("src", string.Empty);

                // Skip images whose src does not have the expected shape. Previously the
                // first such image threw and every result of the search was lost.
                string[] imgLink = imgValue.Split('=');
                if (imgLink.Length < 2)
                {
                    continue;
                }

                // The full image URL is the second '='-separated part, with ':' and '/'
                // decoded and its last two characters removed.
                string imglink = imgLink[1].Replace("%3A", ":").Replace("%2F", "/");
                if (imglink.Length < 2)
                {
                    continue;
                }

                imglink = imglink.Remove(imglink.Length - 2);

                // The thumbnail is the src itself, given a scheme and cut before the last "&q=".
                imgValue = "http:" + imgValue;
                int cut = imgValue.LastIndexOf("&q=");
                if (cut < 0)
                {
                    continue;
                }

                imgValue = imgValue.Substring(0, cut);
                searchlist.Add(new SearchResults { Thumbnail = imgValue, Image = imglink });
            }
        }

        SearchList = searchlist;
        ObservableList();
    }
    catch (Exception e)
    {
        Console.WriteLine("Error: " + e);
    }
}
/// <summary>
/// Loads the suburb page at <c>BASE + URL</c> (HTML-decoded) and selects the links
/// in the fourth column of the table with id <c>myTable</c>.
/// </summary>
/// <param name="URL">Path appended to <c>BASE</c>.</param>
/// <returns>The result of the XPath selection.</returns>
static HtmlNodeCollection GetSuburb(string URL)
{
    HtmlWeb client = new HtmlWeb();
    string suburbURL = System.Net.WebUtility.HtmlDecode(BASE + URL);
    HtmlNode root = client.Load(suburbURL).DocumentNode;
    return root.SelectNodes("//table[@id='myTable']/tbody/tr/td[4]/a");
}
/// <summary>
/// Sets <c>name</c> once, from the first usable source: the page's <c>h1</c>,
/// then its <c>title</c>, then the host part of the URL. Does nothing when
/// <c>name</c> is already set.
/// </summary>
public void setName()
{
    if (name != null)
    {
        return;
    }

    var web = new HtmlAgilityPack.HtmlWeb();
    HtmlDocument doc = web.Load(this._urlLink);

    string newname = GetPlainTextOrNull(doc.DocumentNode.SelectSingleNode("//h1"));
    if (newname == null)
    {
        newname = GetPlainTextOrNull(doc.DocumentNode.SelectSingleNode("//title"));
    }

    if (newname == null)
    {
        // Fall back to the URL: strip scheme and "www", then everything from the first dot.
        string n = Regex.Replace(urlLink, @"^((https:[/]*|http:[/]*)(www)*|(www.))[.]*", "");
        newname = Regex.Replace(n, @"[.].*$", "");
    }

    name = newname;
}

/// <summary>
/// Returns the trimmed inner HTML of <paramref name="node"/> when it holds text only,
/// or null when the node is missing, contains child elements, or is blank.
/// </summary>
private static string GetPlainTextOrNull(HtmlNode node)
{
    if (node == null)
    {
        return null;
    }

    // The text of an element is itself a child node, so the former "!HasChildNodes" test
    // rejected every heading that had text. Only nested elements disqualify a node here.
    foreach (HtmlNode child in node.ChildNodes)
    {
        if (child.NodeType == HtmlNodeType.Element)
        {
            return null;
        }
    }

    string text = node.InnerHtml.Trim();
    return text.Length == 0 ? null : text;
}
/// <summary>
/// Loads the chapter list page, passes it to <c>getInfo</c> and <c>getSummary</c>,
/// adds a <c>Chapter</c> for every table row that has a third cell, updates
/// <c>currentPage</c>, binds the list to <c>lvChapter</c> and calls <c>checkNextPage</c>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <param name="mode">With <c>Mode.Refresh</c> the chapter list is cleared first.</param>
void getData(String url, Mode mode)
{
    if (mode == Mode.Refresh)
    {
        chapter.Clear();
    }

    HtmlAgilityPack.HtmlWeb client = new HtmlAgilityPack.HtmlWeb();
    htmlDoc = client.Load(url);
    getInfo(htmlDoc);
    getSummary(htmlDoc);

    HtmlNode table = htmlDoc.DocumentNode.SelectSingleNode(@"//table[@class='table table-striped']");
    foreach (HtmlNode row in table.SelectNodes("tr"))
    {
        HtmlNode numberCell = row.SelectSingleNode("td[2]");
        HtmlNode titleCell = row.SelectSingleNode("td[3]");
        if (titleCell == null)
        {
            // Rows without a third cell are skipped.
            continue;
        }

        String label = numberCell.SelectSingleNode("strong").InnerText;
        HtmlNode anchor = titleCell.SelectSingleNode("a");
        String displayName = label + " : " + anchor.InnerText;
        String chapterUrl = anchor.GetAttributeValue("href", null);
        chapter.Add(new Chapter(displayName, chapterUrl));
    }

    HtmlNode currentLink = htmlDoc.DocumentNode.SelectSingleNode(@"//a[@title='current-page']");
    currentPage = Convert.ToInt16(currentLink.InnerText);
    lvChapter.ItemsSource = chapter;
    checkNextPage(htmlDoc);
}
/// <summary>
/// Loads a fixed LinkedIn profile page, hands the full-name, description and skill nodes to
/// <c>Print</c>, writes a separator line, calls <c>Info</c> once per profile section name,
/// then waits for a line of console input.
/// </summary>
static void Main(string[] args)
{
    var page = new HtmlWeb().Load("https://ua.linkedin.com/in/kirillmiroshnichenko");

    var fullName = page.DocumentNode.SelectNodes("//span[@class='full-name']");
    Print(fullName);

    var description = page.DocumentNode.SelectNodes("//p[@class='description']");
    Print(description);

    var skillPills = page.DocumentNode.SelectNodes("//span[@class='skill-pill']");
    Print(skillPills);

    Console.WriteLine("-------------");

    string[] sections =
    {
        "experience", "courses", "projects", "certifications", "languages", "education", "interests",
        "patents", "publications", "honors", "test-scores", "organizations", "volunteering"
    };

    for (int index = 0; index < sections.Length; index++)
    {
        Info(page, sections[index]);
    }

    Console.ReadLine();
}
/// <summary>
/// Loads <paramref name="mainUrl"/> and, for every link with an <c>onclick</c> handler inside
/// the container two levels above the <c>td#embedcode</c> cell, URL-decodes the payload of the
/// handler's <c>unescape('...')</c> call, parses it as HTML and records the <c>src</c> of the
/// first &lt;iframe&gt; found in it.
/// </summary>
/// <param name="mainUrl">Page that hosts the video selector.</param>
/// <returns>Link text mapped to iframe source URL; empty when nothing usable is found.</returns>
private IDictionary <string, string> getVideoUrls(string mainUrl)
{
    var videoSources = new Dictionary <string, string>();
    var web = new HAP.HtmlWeb();
    var doc = web.Load(mainUrl);

    var embedCell = doc.DocumentNode.SelectSingleNode("//td[@id='embedcode']");
    if (embedCell == null)
    {
        return videoSources;
    }

    var videoSelectorTable = embedCell.ParentNode.ParentNode;

    // ".//a" keeps the search inside the selector container; a bare "//a" is evaluated from
    // the document root even when called on a sub-node.
    var links = videoSelectorTable.SelectNodes(".//a[@onclick]");
    if (links == null)
    {
        return videoSources;
    }

    const string pivot = "unescape('";
    foreach (var node in links)
    {
        var jscriptCode = node.Attributes["onclick"].Value;

        var pivotIdx = jscriptCode.IndexOf(pivot, StringComparison.Ordinal);
        if (pivotIdx < 0)
        {
            continue; // handler does not embed an escaped snippet
        }

        var srtIdx = pivotIdx + pivot.Length;
        var endIdx = jscriptCode.IndexOf('\'', srtIdx);
        if (endIdx < 0)
        {
            continue; // unterminated string literal
        }

        var redirectUrlEncoded = jscriptCode.Substring(srtIdx, endIdx - srtIdx);
        var redirectUrl = WebUtility.UrlDecode(redirectUrlEncoded);

        var scriptDoc = new HAP.HtmlDocument();
        scriptDoc.LoadHtml(redirectUrl);

        var iframe = scriptDoc.DocumentNode.SelectSingleNode("//iframe");
        string videoPageUrl = iframe != null ? iframe.GetAttributeValue("src", null) : null;
        if (videoPageUrl == null)
        {
            continue;
        }

        videoSources.Add(node.InnerText, videoPageUrl);
    }

    return videoSources;
}
/// <summary>
/// Downloads page <paramref name="pageNumber"/> of the TOK FM listing (read as ISO-8859-2),
/// saves it as XML and re-parses that output, then builds one <c>Podcast</c> per
/// <c>a.tokfm_play</c> link.
/// </summary>
/// <param name="pageNumber">Value sent as the <c>str</c> query parameter.</param>
/// <returns>The podcasts found; empty when the page has no matching links.</returns>
public IEnumerable<Podcast> GetLatestPodcasts(int pageNumber)
{
    var hw = new HtmlWeb();
    hw.OverrideEncoding = Encoding.GetEncoding("ISO-8859-2");
    var doc = hw.Load("http://www.tok.fm/TOKFM/0,94037.html?str=" + pageNumber.ToString(CultureInfo.InvariantCulture));

    doc.OptionOutputAsXml = true;
    doc.OptionCheckSyntax = true;
    doc.OptionFixNestedTags = true;

    // Round-trip: write the document out with the options above, then parse that output again.
    var sb = new StringBuilder();
    using (var stringWriter = new StringWriter(sb))
    {
        doc.Save(stringWriter);
    }

    using (var stringReader = new StringReader(sb.ToString()))
    {
        doc.Load(stringReader);
    }

    var result = new List<Podcast>();

    // SelectNodes returns null rather than an empty collection when nothing matches.
    var links = doc.DocumentNode.SelectNodes("//a[@class='tokfm_play']");
    if (links == null)
    {
        return result;
    }

    foreach (HtmlNode link in links)
    {
        var imgNode = link.SelectSingleNode("img");
        var imageURL = String.Empty;
        if (imgNode != null)
        {
            imageURL = imgNode.Attributes["src"].Value;
        }

        result.Add(new Podcast
        {
            Href = link.Attributes["href"].Value,
            Title = link.Attributes["title"].Value,
            ImageURL = imageURL
        });
    }

    return result;
}
/// <summary>
/// Fetches <paramref name="url"/> with a fresh <c>HtmlWeb</c> and returns the parsed document.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The document produced by <c>HtmlWeb.Load</c>.</returns>
public static HtmlDocument Crawl(string url)
{
    return new HtmlWeb().Load(url);
}
/// <summary>
/// Loads <paramref name="url"/> and collects the <c>value</c> attribute of every
/// &lt;option&gt; directly under the &lt;select class="selectBox"&gt; element.
/// </summary>
/// <param name="url">Page containing the chapter selector.</param>
/// <returns>
/// The option values ("NO_URL" where an option has no value) with the first "#" entry
/// removed, or null when the page has no matching select.
/// </returns>
/// <exception cref="System.InvalidOperationException">More than one matching select exists.</exception>
public static List<string> GetChapterUrls(string url)
{
    HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htdoc = htmlWeb.Load(url);

    // GetAttributeValue tolerates <select> elements that have no class attribute at all;
    // materialise once so the query is not re-run for the count and for Single().
    List<HtmlAgilityPack.HtmlNode> selectList = htdoc.DocumentNode.Descendants("select")
        .Where(x => x.GetAttributeValue("class", string.Empty) == "selectBox")
        .ToList();

    if (selectList.Count == 0)
    {
        return null;
    }

    var selectElement = selectList.Single();

    List<string> ret = new List<string>();
    foreach (var cNode in selectElement.ChildNodes)
    {
        if (cNode.Name == "option")
        {
            ret.Add(cNode.GetAttributeValue("value", "NO_URL"));
        }
    }

    // cleanups
    ret.Remove("#");
    return ret;
}
/// <summary>
/// Looks up the lyrics for <paramref name="artist"/> / <paramref name="title"/> on
/// letras.mus.br and returns them as plain text with CRLF line breaks.
/// </summary>
/// <param name="artist">Artist name; null is treated as empty. Lower-cased before the lookup.</param>
/// <param name="title">Song title; null is treated as empty. Lower-cased before the lookup.</param>
/// <returns>The HTML-decoded lyrics, or an empty string when none are found.</returns>
public static string GetFromTerra(string artist, string title)
{
    string rep = string.Empty;
    artist = (artist + "").ToLowerInvariant();
    title = (title + "").ToLowerInvariant();

    // Fetch the song lyrics.
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(string.Format("http://letras.mus.br/winamp.php?t={0}-{1}", HttpUtility.UrlEncode(artist, ISOEncoding), HttpUtility.UrlEncode(title, ISOEncoding)));
    HtmlNode node = doc.DocumentNode.SelectSingleNode("//div[@id='letra']/p");

    if (node == null)
    {
        // Retry once with '&' spelled as 'e'; the retry cannot recurse again because no '&' remains.
        if (artist.Contains("&") || title.Contains("&"))
        {
            artist = artist.Replace('&', 'e');
            title = title.Replace('&', 'e');
            return GetFromTerra(artist, title);
        }

        // Nothing found and nothing left to try: report "no lyrics" instead of dereferencing null.
        return rep;
    }

    // Lyrics found: turn <br> into line breaks and return the decoded text.
    node.InnerHtml = node.InnerHtml.Replace("<br>", "\r\n");
    rep = WebUtility.HtmlDecode(node.InnerText);
    return rep;
}
/// <summary>
/// Loads the series page at <c>serie.URL</c> and returns the <c>src</c> of the cover image
/// inside <c>div#series_info &gt; div.cover</c>.
/// </summary>
/// <param name="serie">Series whose page is loaded.</param>
/// <returns>The image URL, or an empty string when the image or its <c>src</c> is missing.</returns>
protected override string _GetSerieMiniatureUrl(Serie serie)
{
    var web = new HtmlWeb();
    var doc = web.Load(serie.URL);
    var img = doc.DocumentNode.SelectSingleNode("//div[@id='series_info']/div[@class='cover']/img");

    // A page without a cover gets the same "" result as a cover without a src attribute.
    if (img == null)
    {
        return "";
    }

    return img.GetAttributeValue("src", "");
}
/// <summary>
/// Loads <paramref name="url"/> and appends one "name - email" line to
/// <c>C:\Springer\Springer Emails.txt</c> for every author list item that has an envelope link.
/// </summary>
/// <param name="url">Article page to scan for <c>li[itemprop='author']</c> entries.</param>
/// <returns>
/// The list created at the top of the method. NOTE(review): nothing is ever added to it, so
/// callers always receive an empty list; confirm whether it was meant to carry the results.
/// </returns>
public static List<string> getNameOfEmail(string url)
{
    List<string> a = new List<string>();
    HtmlWeb website = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = website.Load(url);
    HtmlNodeCollection authors = doc.DocumentNode.SelectNodes(".//li[@itemprop='author']");

    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(@"C:\Springer\");

    using (StreamWriter outputFile = new StreamWriter(@"C:\Springer\Springer Emails.txt", true))
    {
        if (authors != null)
        {
            foreach (HtmlNode author in authors)
            {
                HtmlNode Name = author.SelectSingleNode(".//a[@class='person']");
                HtmlNode EMail = author.SelectSingleNode(".//a[@class='envelope']");
                if (EMail == null)
                {
                    continue;
                }

                // Keep the address even when the entry has no person link.
                string personName = Name != null ? Name.InnerText : string.Empty;
                outputFile.WriteLine(personName + " - " + EMail.GetAttributeValue("title", string.Empty));
            }
        }
    }

    return a;
}
/// <summary>
/// Loads <paramref name="url"/>, records it in <c>VisitedPages</c>, and enqueues in
/// <c>LinkQueue</c> every link on the page that is on the same host as <c>baseUrl</c> and has
/// not been queued or visited yet.
/// </summary>
/// <param name="url">Page to crawl.</param>
public static void getHrefs(string url)
{
    try
    {
        HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlWeb();
        HtmlAgilityPack.HtmlDocument doc = htmlWeb.Load(url);

        // Mark the page as crawled as soon as it has been downloaded.
        VisitedPages.Add(url);

        // SelectNodes returns null when the page has no <a href> at all.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return;
        }

        foreach (HtmlNode link in anchors)
        {
            // Resolve against the base URL; skip a malformed href rather than letting it
            // abort every remaining link on the page.
            Uri l;
            if (!Uri.TryCreate(baseUrl, link.Attributes["href"].Value, out l))
            {
                continue;
            }

            string absolute = l.ToString();
            if (!LinkQueue.Contains(absolute) && !VisitedPages.Contains(absolute) && l.Host == baseUrl.Host)
            {
                LinkQueue.Enqueue(absolute);
            }
        }
    }
    catch
    {
        // Best effort by design: a page that cannot be fetched or parsed is simply skipped.
        return;
    }
}
/// <summary>
/// Loads <paramref name="url"/> and selects every <c>a.itemDetail</c> anchor.
/// </summary>
/// <param name="url">Page to download.</param>
/// <returns>The result of <c>SelectNodes</c>, boxed as <c>object</c>.</returns>
public object getData(string url)
{
    HtmlAgilityPack.HtmlDocument page = new HtmlAgilityPack.HtmlWeb().Load(url);
    return page.DocumentNode.SelectNodes("//a[@class='itemDetail']");
}
/// <summary>
/// Reads the cells of table <c>test1</c> on the fundamentus details page three at a time
/// (ticker, trade name, legal name) into <c>Acao</c> objects, echoing each cell to the
/// console. After a line of console input it creates the output file if it does not exist
/// yet; the statements that would fill it are currently commented out.
/// </summary>
static void Main(string[] args)
{
    List <Acao> listaAcao = new List <Acao>();
    Acao a = new Acao();
    string url = "https://www.fundamentus.com.br/detalhes.php";

    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb
    {
        CacheOnly = false,
        CachePath = null,
        UsingCache = false
    };
    HtmlAgilityPack.HtmlDocument doc = web.Load(url);

    // Cells arrive flattened; column cycles 0 -> 1 -> 2 -> 0 to regroup them into rows.
    int column = 0;
    foreach (HtmlNode cell in doc.DocumentNode.SelectNodes("//table[@id='test1']/tbody/tr/td"))
    {
        switch (column)
        {
            case 0:
                a = new Acao();
                a.papel = cell.InnerText;
                column = 1;
                break;
            case 1:
                a.nomeComercial = cell.InnerText;
                column = 2;
                break;
            case 2:
                a.razaoSocial = cell.InnerText;
                listaAcao.Add(a);
                column = 0;
                break;
        }

        Console.WriteLine(cell.InnerText);
    }

    Console.ReadLine();

    string path = @"C:\Users\Yuri\Desktop\Stockbook\SQL\v2\listaAcoes.txt";
    if (File.Exists(path))
    {
        return;
    }

    // Create a file to write to.
    using (StreamWriter sw = File.CreateText(path))
    {
        foreach (var item in listaAcao)
        {
            //sw.WriteLine("INSERT INTO tb_empresa (nome_comercial, razao_social) VALUES('" + item.nomeComercial + "', '" + item.razaoSocial + "')");
            //sw.WriteLine("INSERT INTO tb_acao (ticker, id_empresa) VALUES('" + item.papel + "', (SELECT id_empresa FROM tb_empresa WHERE razao_social = '" + item.razaoSocial + "'))");
        }
    }
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, runs <c>ExtractPrice</c> and
/// <c>ExtractName</c> on it, and builds a <c>Product</c> from the URI and the name.
/// </summary>
/// <param name="uri">Product page address.</param>
/// <returns>A product carrying <paramref name="uri"/> and the extracted name.</returns>
public Product Scrape(Uri uri)
{
    HtmlDocument page = new HtmlAgilityPack.HtmlWeb().Load(uri);

    // The price is extracted but not passed on to the Product.
    var extractedPrice = ExtractPrice(page);
    var extractedName = ExtractName(page);

    return new Product(uri, extractedName);
}
/// <summary>
/// Loads the page at <paramref name="WebUrl"/>, configures the loaded document to emit XML
/// without attribute-value optimisation, and stores its root in <c>baseNode</c>.
/// </summary>
/// <param name="WebUrl">Address of the page to load.</param>
public XPathAgilityPackSelector(string WebUrl)
{
    var hw = new HtmlAgilityPack.HtmlWeb();
    htmlDocument = hw.Load(WebUrl);

    // Set the options on the document that is actually kept. Previously they were set on a
    // placeholder instance that Load() replaced straight away, so they never took effect.
    htmlDocument.OptionOutputAsXml = true;
    htmlDocument.OptionOutputOptimizeAttributeValues = false;

    baseNode = htmlDocument.DocumentNode;
}
/// <summary>
/// Loads the faceitstats.com page for <paramref name="playerName"/> and returns the text of
/// the third cell in the first row of the table under the seventh content div.
/// </summary>
/// <param name="playerName">Name appended to the player URL.</param>
/// <returns>Inner text of the first node matching the XPath.</returns>
public static string ObtainLastMatchScore(string playerName)
{
    const string scoreCellXPath = "//*[@id=\"app\"]/main/div/div[7]/div/table/tbody/tr[1]/td[3]";

    HtmlDocument page = new HtmlAgilityPack.HtmlWeb().Load("https://faceitstats.com/player/" + playerName);
    var scoreCell = page.DocumentNode.SelectSingleNode(scoreCellXPath);

    return scoreCell.InnerText;
}
/// <summary>
/// Loads the faceitstats.com page for <paramref name="playerName"/> and returns the text of
/// the &lt;h5&gt; heading located by a fixed XPath under the first content div.
/// </summary>
/// <param name="playerName">Name appended to the player URL.</param>
/// <returns>Inner text of the first node matching the XPath.</returns>
public static string ObtainFaceitElo(string playerName)
{
    const string eloHeadingXPath = "//*[@id=\"app\"]/main/div/div[1]/div[2]/div[1]/div/div[1]/h5";

    HtmlDocument page = new HtmlAgilityPack.HtmlWeb().Load("https://faceitstats.com/player/" + playerName);
    var eloHeading = page.DocumentNode.SelectSingleNode(eloHeadingXPath);

    return eloHeading.InnerText;
}
/// <summary>
/// Downloads <c>Url</c> with an MSIE 7 user-agent string and returns the document with
/// XML output, auto-close-on-end and a UTF-8 default stream encoding switched on.
/// </summary>
/// <returns>The loaded and configured document.</returns>
public HtmlDocument GetDocument()
{
    var client = new HtmlAgilityPack.HtmlWeb
    {
        UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    };

    HtmlAgilityPack.HtmlDocument page = client.Load(Url);
    page.OptionOutputAsXml = true;
    page.OptionAutoCloseOnEnd = true;
    page.OptionDefaultStreamEncoding = System.Text.Encoding.UTF8;

    return page;
}
/// <summary>
/// Loads the page for <paramref name="url"/> into <c>_htmlDoc</c> and returns the text of
/// its <c>/html/head/title</c> element.
/// </summary>
/// <param name="url">Address to inspect; checked with <c>_urlervice.IsValidUri</c> first.</param>
/// <returns>The title text, or an empty string when the URL is invalid or the page has no title.</returns>
public string GetPageTitle(string url)
{
    if (!_urlervice.IsValidUri(url))
    {
        return string.Empty;
    }

    var uri = new Uri(url);
    var path = _urlervice.GetAbsolutePath(uri);

    _htmlDoc = _web.Load(path);
    _htmlDoc.OptionCheckSyntax = true;
    _htmlDoc.OptionFixNestedTags = true;
    _htmlDoc.OptionAutoCloseOnEnd = true;
    _htmlDoc.OptionDefaultStreamEncoding = Encoding.UTF8;

    // A page without <title> gets the same empty result as an invalid URL.
    var title = _htmlDoc.DocumentNode.SelectSingleNode("/html/head/title");
    return title != null ? title.InnerText : string.Empty;
}
/// <summary>
/// Fetches an ask.com result page for the given query and page number, extracts every
/// http/https/ftp URL from the raw page text that does not contain "ask.com", and exposes
/// the distinct list plus the original inputs through <c>ViewBag</c>.
/// </summary>
/// <param name="SearchQuery">Search terms; trimmed and URL-escaped. Null is treated as empty.</param>
/// <param name="PageNumber">Result page; trimmed and URL-escaped. Null is treated as empty.</param>
/// <returns>The default view for this action.</returns>
public IActionResult GetUrlsFromAskSearch(string SearchQuery, string PageNumber)
{
    // Example search URL: https://www.ask.com/web?q=pizza&page=3
    // Both values come from the request, so escape them before building the query string;
    // otherwise '&', '#', '=' etc. in the input could alter or inject parameters.
    string query = Uri.EscapeDataString((SearchQuery ?? string.Empty).Trim());
    string page = Uri.EscapeDataString((PageNumber ?? string.Empty).Trim());

    HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load("https://www.ask.com/web?q=" + query + "&page=" + page);

    // Work on the raw page text rather than on parsed nodes.
    string PageContent = doc.Text ?? string.Empty;

    Regex RegForUrls = new Regex(@"(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?", RegexOptions.IgnoreCase);

    // Keep every match except those that mention ask.com, then drop duplicates.
    List <string> URLsList = RegForUrls.Matches(PageContent)
        .Cast<Match>()
        .Select(match => match.Value)
        .Where(value => value.IndexOf("ask.com", StringComparison.OrdinalIgnoreCase) < 0)
        .Distinct()
        .ToList();

    ViewBag.DataUrlList = URLsList;
    ViewBag.SearchText = SearchQuery;
    ViewBag.SearchPN = PageNumber;
    return View();
}
/// <summary>
/// Loads the Naver Finance quote page for code 005930 and prints the text of the
/// <c>strong#_nowVal</c> element to the console.
/// </summary>
public void Test3()
{
    // The WebClient that used to be created here was never used or disposed, so it is gone.
    string url = "https://finance.naver.com/item/sise.nhn?code=005930";

    agi.HtmlWeb web = new agi.HtmlWeb();
    agi.HtmlDocument htmlDoc = web.Load(url);

    var mainNode = htmlDoc.DocumentNode.SelectSingleNode("//strong[@id='_nowVal']");
    Console.WriteLine(mainNode.InnerText);
}
/// <summary>
/// Downloads <c>Url</c> with an MSIE 7 user-agent string, switches the document to XML
/// output, and parses the outer markup of its <c>html</c> child into an <c>XDocument</c>.
/// </summary>
/// <returns>The page's html element as an <c>XDocument</c>.</returns>
public XDocument GetXDocument()
{
    var client = new HtmlAgilityPack.HtmlWeb
    {
        UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    };

    HtmlAgilityPack.HtmlDocument page = client.Load(Url);
    page.OptionOutputAsXml = true;
    page.OptionAutoCloseOnEnd = true;
    page.OptionDefaultStreamEncoding = System.Text.Encoding.UTF8;

    // OuterHtml is read only after the output options are in place.
    var htmlElement = page.DocumentNode.SelectSingleNode("html");
    string markup = htmlElement.OuterHtml;

    return XDocument.Parse(markup);
}
/// <summary>
/// Loads each entry of <paramref name="url1"/> (via its <c>ToString()</c>) and queues the
/// text of the page's <c>/html/head/title</c> element, in input order.
/// </summary>
/// <param name="url1">Objects whose string form is a page address.</param>
/// <returns>A queue holding one title string per input entry.</returns>
public Queue gettitleandbody(object[] url1)
{
    Queue titles = new Queue();

    for (int index = 0; index < url1.Length; index++)
    {
        string address = url1[index].ToString();
        HtmlAgilityPack.HtmlDocument page = new HtmlAgilityPack.HtmlWeb().Load(address);

        var titleNode = page.DocumentNode.SelectSingleNode("/html/head/title");
        titles.Enqueue(titleNode.InnerText);
    }

    return titles;
}
/// <summary>
/// Downloads <paramref name="url"/>, counts every letters-only word in the page body, and
/// merges those counts into the shared <c>wordCountResults</c> table.
/// </summary>
/// <param name="url">Page address; "http:" is prepended when it does not start with "http".</param>
static void processUrl(string url)
{
    // Counts are gathered in a local table first and merged into the shared table once at
    // the end, so the lock is taken a single time per page rather than once per word.
    Hashtable threadResults = new Hashtable();
    HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();

    // Test the prefix, not "contains": a scheme-less URL with "http" somewhere in its path
    // or query still needs the scheme added.
    if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
    {
        // Adding 'http:' prefix as required by HtmlAgilityPack.
        url = "http:" + url;
    }

    HtmlAgilityPack.HtmlDocument doc = hw.Load(url);
    var body = doc.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        return; // nothing to count
    }

    string pageContent = System.Net.WebUtility.HtmlDecode(body.InnerText);

    // '\r' is a separator so words at the end of CRLF lines are not lost. Empty entries are
    // removed because "".All(Char.IsLetter) is true and would be counted as a word.
    string[] wordList = pageContent.Split(
        new Char[] { ' ', ',', '.', '·', '(', ')', '[', ']', '{', '}', '?', '¿', '!', '¡', '"', '\'', '/', '-', ':', ';', '=', '+', '\n', '\r', '\t' },
        StringSplitOptions.RemoveEmptyEntries);

    foreach (string word in wordList)
    {
        // Only strings made entirely of letters count; numbers and leftover symbols are skipped.
        if (!word.All(Char.IsLetter))
        {
            continue;
        }

        if (threadResults.Contains(word))
        {
            threadResults[word] = Convert.ToInt32(threadResults[word]) + 1;
        }
        else
        {
            threadResults.Add(word, 1);
        }
    }

    // Merge the local counts into the shared table under the lock.
    lock (locker)
    {
        foreach (DictionaryEntry wordEntry in threadResults)
        {
            if (wordCountResults.Contains(wordEntry.Key))
            {
                wordCountResults[wordEntry.Key] = Convert.ToInt32(wordCountResults[wordEntry.Key]) + Convert.ToInt32(wordEntry.Value);
            }
            else
            {
                wordCountResults.Add(wordEntry.Key, wordEntry.Value);
            }
        }
    }
}