/// <summary>
/// Copies the rules found in the <c>style</c> elements of <paramref name="htmlInput"/> onto the
/// <c>style</c> attribute of every element they select.
/// </summary>
/// <param name="htmlInput">The HTML input.</param>
/// <param name="removeStyleElements">if set to <c>true</c> the processed style elements are removed.</param>
/// <returns>
/// The rendered document with the styles inlined, or <paramref name="htmlInput"/> unchanged
/// when it contains no style elements.
/// </returns>
public string MoveCssInline(string htmlInput, bool removeStyleElements)
{
    var document = CQ.CreateDocument(htmlInput);
    var styleElements = document["style"];

    // Nothing to inline: hand the input back untouched (not re-rendered).
    if (styleElements == null || styleElements.Length == 0)
    {
        return htmlInput;
    }

    foreach (var styleElement in styleElements)
    {
        // A style block with id "mobile" is left alone: neither inlined nor removed.
        if (String.Equals(styleElement.Attributes["id"], "mobile", StringComparison.InvariantCultureIgnoreCase))
        {
            continue;
        }

        var parser = new CssParser();
        parser.AddStyleSheet(styleElement.InnerHTML);

        foreach (var parsedRule in parser.Styles)
        {
            // Rules keyed with "@media" are skipped.
            if (parsedRule.Key.StartsWith("@media"))
            {
                continue;
            }

            var ruleStyle = parsedRule.Value;
            foreach (var target in document[ruleStyle.Name])
            {
                if (!_elementsWithoutStyle.Contains(target.NodeName.ToLower()))
                {
                    var inlineStyle = target.Style;
                    if (inlineStyle != null)
                    {
                        // Parse the element's current inline style, merge the rule into it,
                        // then write every resulting attribute back to the element.
                        StyleClass merged = parser.ParseStyleClass("dummy", inlineStyle.CssText ?? String.Empty);
                        merged.Merge(ruleStyle, true);
                        foreach (var attribute in merged.Attributes)
                        {
                            inlineStyle.SetStyle(attribute.Key, attribute.Value, false);
                        }
                    }
                }
            }
        }

        if (removeStyleElements)
        {
            styleElement.Remove();
        }
    }

    return document.Render();
}
/// <summary>
/// Parses the driver's current page source and builds a lazy sequence of (id, value) pairs
/// for every <c>th</c> element whose id contains <c>_CaseNumber_</c>.
/// </summary>
/// <param name="driver">The web driver whose <c>PageSource</c> is parsed.</param>
/// <remarks>
/// NOTE(review): the resulting sequence is never enumerated or returned, and nothing is clicked;
/// the method looks unfinished - confirm the intended behaviour.
/// </remarks>
public static void ClickPatientCase(IWebDriver driver)
{
    // await Task.Delay(2000); // allow page to render
    var pageDom = CQ.CreateDocument(driver.PageSource);
    var caseNumberHeaders = pageDom["th[id*='_CaseNumber_']"];

    var pairs = caseNumberHeaders.Select(
        header => new Tuple<string, string>(header.Id, header.Value));
}
/// <summary>
/// Creates a PreMailer for the given HTML: parses it into a document and sets up an empty
/// warning list together with the CSS and CSS-selector parsers.
/// </summary>
/// <param name="html">The HTML input.</param>
public PreMailer(string html)
{
    this._document = CQ.CreateDocument(html);
    this._warnings = new List<string>();
    this._cssParser = new CssParser();
    this._cssSelectorParser = new CssSelectorParser();
}
/// <summary>
/// Downloads this node's page, adds one <c>Village</c> member per <c>.villagetr</c> row and then
/// runs the base implementation.
/// </summary>
/// <returns>A task that completes once the members are added and the base call has finished.</returns>
/// <remarks>When <c>Url</c> is null nothing is downloaded and the base implementation is not invoked.</remarks>
public async Task GetMembersAsync()
{
    if (this.Url == null)
    {
        return;
    }

    string pageContent = await this.GetPageContentAsync();
    var cq = CQ.CreateDocument(pageContent);

    foreach (var villageElement in cq[".villagetr"])
    {
        // The first three child nodes of a row are read as code, type and name.
        var cells = villageElement.ChildNodes;
        var village = new Village();
        village.Code = cells[0].InnerText;
        village.CxType = cells[1].InnerText;
        village.Name = cells[2].InnerText;
        AddMember(village);
    }

    // Await the base call so its failures surface here instead of being silently dropped.
    await base.GetMembersAsync();
}
/// <summary>
/// Collects the detail images of a Gome page: reads the <c>prdInfo</c> JSON embedded in
/// <c>pageContent</c>, downloads the detail HTML referenced by its <c>htmlHref</c> entry and
/// registers the <c>src</c> of every <c>img</c> found there.
/// </summary>
/// <exception cref="Exception">
/// The embedded JSON cannot be parsed, the detail URL is missing, or the detail download fails.
/// The underlying error is attached as the inner exception.
/// </exception>
private void GomeDesc()
{
    JObject jsonConf;
    try
    {
        string jsonConfStr = new Regex("(?smi)var prdInfo = {(.*?)};").Match(pageContent).Groups[1].Value.Trim();
        // The pattern captures only the inside of the braces, so they are added back before parsing.
        jsonConf = JObject.Parse("{" + jsonConfStr + "}");
    }
    catch (Exception e)
    {
        // Keep the root cause available to callers and logs.
        throw new Exception("解析 JSON 失败", e);
    }

    if (jsonConf["htmlHref"] == null)
    {
        throw new Exception("详情内容请求URL无法正常获取");
    }

    string descReqUrl = UrlSchemeFull(jsonConf["htmlHref"].ToString());

    Log("\n");
    LogInfo("开始下载详情内容:" + descReqUrl);

    string descContent;
    try
    {
        descContent = Utils.GetPageByUrl(descReqUrl).Html;
        // Keep only the string passed inside the ("...") wrapper of the response.
        descContent = new Regex("(?smi)\\(\"(.*?)\"\\)").Match(descContent).Groups[1].Value.Trim();
    }
    catch (Exception e)
    {
        throw new Exception("详情内容下载失败:" + e.Message, e);
    }

    LogSuccess("详情内容下载完毕");

    var descDom = CQ.CreateDocument(descContent);
    descDom["img"].Each((i, e) =>
    {
        AddImgUrl("详情图", e.GetAttribute("src"));
    });
}
/// <summary>
/// Reads <c>page.html</c> once and parses it with both parsers, storing the results in
/// <c>angleSharpDocument</c> and <c>cqDocument</c>.
/// </summary>
public void GlobalSetup()
{
    string markup = File.ReadAllText("page.html");

    angleSharpDocument = angleSharpParser.ParseDocument(markup);
    cqDocument = CQ.CreateDocument(markup);
}
/// <summary>
/// Collects the detail images of a Taobao page: extracts the <c>descUrl</c> from
/// <c>pageContent</c>, downloads the script behind it, takes the HTML assigned to
/// <c>var desc</c> and registers the <c>src</c> of every <c>img</c> in it.
/// </summary>
/// <exception cref="Exception">
/// The detail download fails; the underlying error is attached as the inner exception.
/// </exception>
private void TaobaoDesc()
{
    string descReqUrl = new Regex("(?smi)descUrl : location.protocol===\\'http:\\' \\? \\'.*?\\' : \'(.*?)\'").Match(pageContent).Groups[1].Value.Trim();

    // A failed match yields an empty string, never null, so the guard has to test for emptiness.
    if (string.IsNullOrEmpty(descReqUrl))
    {
        Log("详情内容请求URL无法正常获取");
        return;
    }

    descReqUrl = UrlSchemeFull(descReqUrl);

    Log("\n");
    LogInfo("开始下载详情内容:" + descReqUrl);

    string descContent;
    try
    {
        descContent = Utils.GetPageByUrl(descReqUrl).Html;
        descContent = new Regex("(?smi)var desc='(.*?)';").Match(descContent).Groups[1].Value.Trim();
    }
    catch (Exception e)
    {
        // Keep the root cause available to callers and logs.
        throw new Exception("详情内容下载失败:" + e.Message, e);
    }

    LogSuccess("详情内容下载完毕");

    var descDom = CQ.CreateDocument(descContent);
    descDom["img"].Each((i, e) =>
    {
        string picSrcUrl = e.GetAttribute("src");
        AddImgUrl("详情图", picSrcUrl);
    });
}
/// <summary>
/// Collects the detail images of an Alibaba page: reads the detail URL from the
/// <c>data-tfs-url</c> attribute of <c>.desc-lazyload-container</c>, downloads it, takes the
/// <c>content</c> entry of the <c>offer_details</c> JSON and registers the <c>src</c> of every
/// <c>img</c> in that HTML.
/// </summary>
/// <exception cref="Exception">
/// Downloading or parsing the detail content fails; the underlying error is attached as the
/// inner exception.
/// </exception>
private void AlibabaDesc()
{
    string descReqUrl = pageDom[".desc-lazyload-container"].Attr("data-tfs-url");
    if (descReqUrl == null)
    {
        Log("详情内容请求URL无法正常获取");
        return;
    }

    descReqUrl = UrlSchemeFull(descReqUrl);

    Log("\n");
    LogInfo("开始下载详情内容:" + descReqUrl);

    string descContent;
    try
    {
        descContent = Utils.GetPageByUrl(descReqUrl).Html;
        descContent = new Regex("(?smi)var offer_details={(.*?)};").Match(descContent).Groups[1].Value.Trim();

        // The pattern captures only the inside of the braces, so they are added back before parsing.
        JObject descContentJson = JObject.Parse("{" + descContent + "}");
        descContent = descContentJson["content"].ToString();
    }
    catch (Exception e)
    {
        // Keep the root cause available to callers and logs.
        throw new Exception("详情内容下载失败:" + e.Message, e);
    }

    LogSuccess("详情内容下载完毕");

    var descDom = CQ.CreateDocument(descContent);
    descDom["img"].Each((i, e) =>
    {
        AddImgUrl("详情图", e.GetAttribute("src"));
    });
}
/// <summary>
/// Asynchronously gets the members: downloads this node's page, adds one <c>Province</c> member
/// per linked <c>.provincetr td a</c> anchor and then runs the base implementation.
/// </summary>
/// <returns>A task that completes once the members are added and the base call has finished.</returns>
public async Task GetMembersAsync()
{
    string pageContent = await GetPageContentAsync();

    // NOTE(review): this assigns a process-wide CsQuery setting on every call.
    CsQuery.Config.OutputFormatter = CsQuery.OutputFormatters.HtmlEncodingNone;

    var cq = CQ.CreateDocument(pageContent);
    foreach (var anchor in cq[".provincetr td a"])
    {
        if (!anchor.HasAttribute("href"))
        {
            continue;
        }

        string href = anchor.GetAttribute("href");

        // Build the province URL by replacing the last path segment of this node's URL with the link target.
        var segments = Url.Split('/');
        segments[segments.Length - 1] = href;

        var newProvince = new Province
        {
            Name = anchor.InnerText,
            Url = String.Join("/", segments),
            // The code is the part of the href before the first '.', followed by ten zeros.
            Code = String.Format("{0:D2}0000000000", href.Split('.')[0])
        };
        AddMember(newProvince);
    }

    // Await the base call so its failures surface here instead of being silently dropped.
    await base.GetMembersAsync();
}
/// <summary>
/// Collects the detail images of a Tmall page: reads the JSON passed to <c>TShop.Setup(...)</c>
/// in <c>pageContent</c>, downloads the script behind its <c>api.descUrl</c> entry, takes the
/// HTML assigned to <c>var desc</c> and registers the <c>src</c> of every <c>img</c> in it.
/// </summary>
/// <exception cref="Exception">
/// The embedded JSON cannot be parsed or the detail download fails; the underlying error is
/// attached as the inner exception.
/// </exception>
private void TmallDesc()
{
    JObject jsonConf;
    try
    {
        string jsonStr = new Regex("(?smi)TShop.Setup\\((.*?)\\);").Match(pageContent).Groups[1].Value.Trim();
        jsonConf = JObject.Parse(jsonStr);
    }
    catch (Exception e)
    {
        // Keep the root cause available to callers and logs.
        throw new Exception("解析 JSON 失败", e);
    }

    // A missing "api" entry is handled like a missing "descUrl" instead of failing with a null reference.
    var descUrlToken = jsonConf["api"]?["descUrl"];
    if (descUrlToken == null)
    {
        Log("详情内容请求URL无法正常获取");
        return;
    }

    string descReqUrl = UrlSchemeFull(descUrlToken.ToString());

    Log("\n");
    LogInfo("开始下载详情内容:" + descReqUrl);

    string descContent;
    try
    {
        // Download the detail content.
        descContent = Utils.GetPageByUrl(descReqUrl).Html;
        descContent = new Regex("(?smi)var desc='(.*?)';").Match(descContent).Groups[1].Value.Trim();
    }
    catch (Exception e)
    {
        throw new Exception("详情内容下载失败:" + e.Message, e);
    }

    LogSuccess("详情内容下载完毕");

    var descDom = CQ.CreateDocument(descContent);
    descDom["img"].Each((i, e) =>
    {
        AddImgUrl("详情图", e.GetAttribute("src"));
    });
}
/// <summary>
/// Parses <paramref name="result"/> as HTML and reads the values of the <c>WORK</c> and
/// <c>LAST</c> input fields.
/// </summary>
/// <param name="workItemId">Receives the value of <c>input[name='WORK']</c>.</param>
/// <param name="isLastWorkItem">Receives the value of <c>input[name='LAST']</c>.</param>
/// <param name="result">The HTML to parse.</param>
private static void GetWorkItemFromDom(out string workItemId, out string isLastWorkItem, string result)
{
    var responseDom = CQ.CreateDocument(result);

    var workInput = responseDom["input[name='WORK']"];
    workItemId = workInput.Val();

    var lastInput = responseDom["input[name='LAST']"];
    isLastWorkItem = lastInput.Val();
}
/// <summary>
/// Parses a result page and classifies it as a loss, a win or a replay.
/// </summary>
/// <param name="content">The HTML of the result page.</param>
/// <returns>
/// A non-winning result when the reward text is the "did not win" message; otherwise the result
/// of <c>HandleWinner</c> when prize elements exist, the result of <c>HandleReplay</c> when a
/// reward-layer link exists, or <c>null</c> when none of these is found.
/// </returns>
public ParseResults Parse(string content)
{
    CDocument = CQ.CreateDocument(content);

    var rewardText = CDocument.Select("div#reward-layer span.reward-text").FirstOrDefault();

    // The reward text may be absent; in that case the page is not treated as a loss and the
    // winner/replay checks below decide the outcome (previously this threw a NullReferenceException).
    if (rewardText != null && rewardText.InnerText == "Sorry, you did not win.  Please try again later.")
    {
        return new ParseResults { IsWinner = false, IsReplay = false };
    }

    // Not a loss: look for prize winnings, then for a replay link.
    var prizeDivs = CDocument.Select("div.winning-card-prize");
    var redeemUrl = CDocument.Select("div#reward-layer div.debug a");
    var replayDiv = CDocument.Select("div#reward-layer a");

    if ((prizeDivs != null) && (prizeDivs.Any()))
    {
        return HandleWinner(prizeDivs, redeemUrl);
    }

    if ((replayDiv != null) && (replayDiv.Any()))
    {
        return HandleReplay(replayDiv);
    }

    return null;
}
/// <summary>
/// Wraps a fetch result: keeps the result, parses its <c>Content</c> into a document and
/// defers the base-URI lookup (<c>GetBaseUri</c>) until it is first needed.
/// </summary>
/// <param name="result">The fetch result whose content is parsed.</param>
public Document(FetchResult result)
{
    this._result = result;
    this._document = CQ.CreateDocument(result.Content);
    this._baseUri = new Lazy<Uri>(GetBaseUri);
}
/// <summary>
/// Checks table parsing in both modes: <c>CQ.Create</c> must not add html/head/body wrappers,
/// <c>CQ.CreateDocument</c> must add exactly one of each; <c>AutoCreateTests</c> runs on both results.
/// </summary>
public void AutoCreateTableTags()
{
    var markup = @"<table id=table-uda> <tr> <th>Attribute <th>Setter Condition <tr><td><dfn id=dom-uda-protocol title=dom-uda-protocol><code>protocol</code></dfn> <td><a href=#url-scheme title=url-scheme><scheme></a> </tr></table>";

    // Fragment mode: should not create wrapper elements.
    var fragment = CQ.Create(markup);
    Assert.AreEqual(0, fragment["body"].Length);
    Assert.AreEqual(0, fragment["head"].Length);
    AutoCreateTests(fragment);

    // Document mode: should create the wrapper elements.
    var document = CQ.CreateDocument(markup);
    Assert.AreEqual(1, document["body"].Length);
    Assert.AreEqual(1, document["html"].Length);
    Assert.AreEqual(1, document["head"].Length);

    var topLevelNames = document["html > *"].Select(item => item.NodeName);
    Assert.AreEqual(Arrays.Create("HEAD", "BODY"), topLevelNames);
    AutoCreateTests(document);
}
/// <summary>
/// Searches the remote catalogue (platform id 11) for <paramref name="filename"/> and returns
/// the result whose sanitized title is closest to the sanitized file name.
/// </summary>
/// <param name="filename">The game file name to search for.</param>
/// <returns>The most relevant search result, or <c>null</c> when the search returns nothing.</returns>
public GameInfoModel GetDataFromApi(string filename)
{
    var platform = "?platformID[]=11";

    // Escape the name so characters such as '&', '#' or '+' cannot corrupt the query string;
    // spaces are still sent as '+', exactly as before.
    var name = "&name=" + System.Uri.EscapeDataString(filename).Replace("%20", "+");

    var response = ScrapeArt.WebClient(BaseUrl, p =>
        p.DownloadString("search.php" + platform + name)
    );

    var html = CQ.CreateDocument(response);
    if (!html["div#display"].Children().Any())
    {
        return null;
    }

    var sanitizedSearchFilename = SanitizeSeach(filename);
    var resultRoot = html["div#display div > a[href]"];
    var results = IterateResults(resultRoot).ToArray();

    // The results arrive in the wrong order, so pick the most relevant one: smallest edit
    // distance first, ties broken by the (signed) length difference. This is the same ordering
    // the previous two stacked stable sorts produced.
    // NOTE(review): the length difference is signed, so longer titles sort first on ties - confirm intent.
    var orderedByRelevance = (
        from item in results
        let sanitizedSearchTitle = SanitizeSeach(item.Title)
        let levenshteinDistance = LevenshteinDistance.Compute(sanitizedSearchFilename, sanitizedSearchTitle)
        let lengthDifference = sanitizedSearchFilename.Length - sanitizedSearchTitle.Length
        orderby levenshteinDistance, lengthDifference
        select item
    ).ToArray();

    return orderedByRelevance.FirstOrDefault();
}
/// <summary>
/// Parses a page whose <c>body</c> and <c>html</c> elements are never closed and checks that
/// all four <c>script</c> elements are found.
/// </summary>
public void AutoCreateHtmlBody()
{
    const int expectedScriptCount = 4;

    string markup = @"<html> <head> <script type=""text/javascript"">lf={version: 2064750,baseUrl: '/',helpHtml: '<a class=""email"" href=""mailto:xxxxx@xxxcom"">email</a>',prefs: { pageSize: 0}}; lf.Scripts={""crypt"":{""path"":""/scripts/thirdp/sha512.min.2009762.js"",""nameSpace"":""Sha512""}}; </script><link rel=""icon"" type=""image/x-icon"" href=""/favicon.ico""> <title>Title</title> <script type=""text/javascript"" src=""/scripts/thirdp/jquery-1.7.1.min.2009762.js""></script> <script type=""text/javascript"">var _gaq = _gaq || []; _gaq.push(['_setAccount', 'UA-xxxxxxx1']); _gaq.push(['_trackPageview']); </script> </head> <body> <script type=""text/javascript""> alert('done'); </script>";

    var document = CQ.CreateDocument(markup);
    var scripts = document["script"];

    Assert.AreEqual(expectedScriptCount, scripts.Length);
}
/// <summary>
/// Lazily yields one <c>WikipediaCountry</c> for every data row (first cell is a <c>TD</c>,
/// at least four cells) of the <c>wikitable</c> tables inside <c>#bodyContent</c>.
/// </summary>
/// <param name="html">The HTML of the page to scan.</param>
/// <returns>The countries read from the first four cells of each qualifying row.</returns>
private IEnumerable<WikipediaCountry> ExtractFromTables(string html)
{
    var document = CQ.CreateDocument(html);

    // TODO: Be a bit more selective on which tables to use (not only cells.length>4 but "scan" header-row for specific text for example)
    foreach (var tableBody in document.Select("#bodyContent table.wikitable tbody"))
    {
        foreach (var row in tableBody.ChildElements)
        {
            // Skip rows that have no element children or that do not start with a data cell.
            var leadingCell = row.FirstElementChild;
            if (leadingCell == null || !leadingCell.NodeName.Equals("TD"))
            {
                continue;
            }

            // Do we have enough data?
            var cells = row.ChildElements.ToArray();
            if (cells.Length < 4)
            {
                continue;
            }

            yield return new WikipediaCountry
            {
                // The name is taken from the last child node of the first cell.
                CountryName = cells[0].LastChild.Cq().Text(),
                Alpha2 = cells[1].Cq().Text(),
                Alpha3 = cells[2].Cq().Text(),
                Numeric = cells[3].Cq().Text(),
            };
        }
    }
}
/// <summary>
/// Polls <c>url</c> forever: on each successful response reads the <c>.maincounter-number</c>
/// elements, records a new <c>CoronaCases</c> entry and writes the data, then waits
/// <c>timeToSleep</c> before polling again.
/// </summary>
/// <returns>A task that only completes if an iteration throws.</returns>
private static async Task Start()
{
    // A loop replaces the former "await Start()" self-call, which kept every previous
    // invocation alive and so grew an unbounded chain of pending tasks.
    while (true)
    {
        var client = new RestClient(url);
        var response = await client.ExecuteAsync(new RestRequest());
        var now = DateTime.Now;

        if (response.IsSuccessful)
        {
            var dom = CQ.CreateDocument(response.Content);
            var numbers = dom[".maincounter-number"].Selection.ToList();

            if (numbers.Count > 0)
            {
                GetCounts(numbers, out var deathCount, out var caseCount);
                Calculations(caseCount, deathCount, now);
                cases.Add(new CoronaCases { Cases = caseCount, Deaths = deathCount, DataTime = now });
                WriteData();
            }
        }

        await Task.Delay(timeToSleep);
    }
}
/// <summary>
/// Downloads this node's page, adds one <c>City</c> member per <c>.citytr</c> row and then runs
/// the base implementation.
/// </summary>
/// <returns>A task that completes once the members are added and the base call has finished.</returns>
public async Task GetMembersAsync()
{
    string pageContent = await this.GetPageContentAsync();

    // NOTE(review): this assigns a process-wide CsQuery setting on every call.
    CsQuery.Config.OutputFormatter = CsQuery.OutputFormatters.HtmlEncodingNone;

    var cq = CQ.CreateDocument(pageContent);
    foreach (var cityElement in cq[".citytr"])
    {
        // The first grandchild of the row carries the city code and, when present, the link to the city page.
        var codeNode = cityElement.FirstChild.FirstChild;

        var city = new City();

        // Only the new city receives the code; the former chained assignment also overwrote
        // this object's own Code on every iteration.
        city.Code = codeNode.InnerText;

        if (codeNode.HasAttribute("href"))
        {
            // Build the city URL by replacing the last path segment of this node's URL with the link target.
            var segments = Url.Split('/');
            segments[segments.Length - 1] = codeNode.GetAttribute("href");
            city.Url = String.Join("/", segments);
        }

        city.Name = cityElement.ChildNodes[1].FirstChild.InnerText;
        AddMember(city);
    }

    // Await the base call so its failures surface here instead of being silently dropped.
    await base.GetMembersAsync();
}
/// <summary>
/// Parses <paramref name="html"/> into <c>Document</c>, keeps the raw text in <c>Html</c> and
/// starts with empty <c>Links</c> and <c>Items</c> lists.
/// </summary>
/// <param name="html">The HTML to parse.</param>
public ParsedDocument(string html)
{
    this.Document = CQ.CreateDocument(html);
    this.Html = html;

    this.Links = new List<WebPage>();
    this.Items = new List<ContentItem>();
}
/// <summary>
/// Checks that the Flot app returned by <c>FlotWebApp.GetFlotApp</c> is not empty and can be
/// parsed as an HTML document without throwing.
/// </summary>
public void FlotVisualization_CanReadAppFromResource()
{
    var appHtml = FlotWebApp.GetFlotApp();

    appHtml.Should().NotBeEmpty();
    Assert.DoesNotThrow(() =>
    {
        CQ.CreateDocument(appHtml);
    });
}
/// <summary>
/// Regression test for issue 145: a namespaced attribute (<c>xmlns:xi</c>) on the <c>html</c>
/// element must be readable after parsing.
/// </summary>
public void Issue145()
{
    var dom = @"<html xmlns=""http://www.w3.org/1999/xhtml"" xmlns:xi=""http://www.w3.org/2001/XInclude""><body></html>";
    var cq = CQ.CreateDocument(dom);

    // Expected value first, so a failure message reports expected/actual the right way round.
    Assert.AreEqual("http://www.w3.org/2001/XInclude", cq["html"].Attr("xmlns:xi"));
}
/// <summary>
/// Checks the <c>InnerText</c> of the <c>body</c> element of <c>testHtml</c>: two lines
/// separated by the platform newline.
/// </summary>
public void InnerTextGet2()
{
    string expected = "This text has a link" + Environment.NewLine + "a new block";

    CQ document = CQ.CreateDocument(testHtml);
    var body = document["body"][0];

    Assert.AreEqual(expected, body.InnerText);
}
/// <summary>
/// Entry point of the task: reads the parameters, downloads <c>PageUrl</c> (after obtaining a
/// login cookie for Alibaba pages), parses the page and invokes the private instance method
/// named <c>PageType + ImgType</c>, then calls <c>AfterGetImgUrl</c>.
/// </summary>
/// <exception cref="Exception">The Alibaba cookie could not be obtained or the page download did not return 200 OK.</exception>
public override void BeginWork()
{
    base.BeginWork();

    // Parameter setup.
    PageUrl = GetParm("PageUrl"); // kept as a string: new Uri() would automatically decode url-encoded parameters
    PageType = GetParm("PageType");
    ImgType = GetParm("ImgType");
    CollType = GetParm("CollType");

    // Download the page.
    LogInfo("开始下载:" + PageUrl);

    // Request headers and encoding depend on the page type.
    var headers = new Dictionary<string, string>();
    Encoding encoding = Encoding.GetEncoding("UTF-8");

    if (PageType.Equals("Alibaba"))
    {
        // The end-URL pattern is the page URL without its query string. A URL without '?'
        // is used as a whole (previously Substring threw because IndexOf returned -1).
        // NOTE(review): the URL is put into the pattern unescaped - confirm whether EndUrlReg expects Regex.Escape.
        int queryStart = PageUrl.IndexOf('?');
        string pageUrlWithoutQuery = queryStart >= 0 ? PageUrl.Substring(0, queryStart) : PageUrl;

        // Obtain the login cookie.
        var cgSettings = new NacollectorUtils.Settings.CookieGetterSettings
        {
            StartUrl = "https://login.1688.com/member/signin.htm?from=sm&Done=" + HttpUtility.UrlEncode(PageUrl),
            EndUrlReg = @"^" + pageUrlWithoutQuery,
            Caption = "登录 1688",
        };
        cgSettings.UseInputAutoComplete(@"^https://login\.1688.com/member/signin\.htm", new List<string>() { "#TPL_username_1", "#TPL_password_1" });

        // Shows the login dialog and blocks until it is done.
        string alibabaCookieStr = GetSpiderSettings().CrBrowserCookieGetter(cgSettings);
        if (string.IsNullOrEmpty(alibabaCookieStr))
        {
            throw new Exception("Cookie 获取未成功");
        }

        encoding = Encoding.GetEncoding("gb2312");
        headers.Add("cookie", alibabaCookieStr);
    }

    var downloadPage = Utils.GetPageByUrl(PageUrl, headers, null, encoding);
    if (downloadPage.StatusCode != System.Net.HttpStatusCode.OK)
    {
        throw new Exception("下载失败 [" + downloadPage.StatusCode + "] " + downloadPage.StatusDescription);
    }

    pageContent = downloadPage.Html;
    LogSuccess("下载完毕");

    pageDom = CQ.CreateDocument(pageContent);

    // Invoke the handler method for this page/image type combination.
    this.GetType().GetMethod(PageType + ImgType, BindingFlags.NonPublic | BindingFlags.Instance).Invoke(this, new object[] { });

    // Display & collect.
    AfterGetImgUrl();
}
// ReSharper disable ExceptionNotThrown
/// <summary>
/// Logs in through the auth-portal gateway: loads the login page, re-submits its form with the
/// hidden fields plus the given credentials and returns the cookies collected on the way.
/// </summary>
/// <param name="loginGatewayUri">The gateway URI that leads to the login page.</param>
/// <param name="usernameOrEmail">Value sent as the <c>username</c> form field.</param>
/// <param name="password">Value sent as the <c>password</c> form field.</param>
/// <returns>The cookie container filled by both requests.</returns>
/// <exception cref="HttpRequestException">Recieved an error status code that could not be recovered from.</exception>
/// <exception cref="WebException">Request failed due to a bad connection (e.g. no internet).</exception>
/// <exception cref="InvalidUsernameAndPasswordException">The server rejected the login credentials.</exception>
private async Task<CookieContainer> LoginViaAuthPortalGateway(Uri loginGatewayUri, string usernameOrEmail, string password)
{
    var cookies = new CookieContainer();
    var handler = new HttpClientHandler
    {
        AllowAutoRedirect = true,
        UseCookies = true,
        CookieContainer = cookies
    };

    // The client is only needed for these two requests; the cookie container outlives it.
    using (var client = new HttpClient(handler))
    {
        // Retrieve the login page by using the gateway.
        var loginPageResponse = await client.SendAsync(new HttpRequestMessage(HttpMethod.Get, loginGatewayUri));
        loginPageResponse.EnsureSuccessStatusCode();

        var authPage = await loginPageResponse.Content.ReadAsStringAsync();
        var authPageDom = CQ.CreateDocument(authPage);
        var form = authPageDom.Select(".login form").First();

        // Prepare to send the form found in the login page.
        // The hidden inputs are included in the actual login POST request.
        var hiddenInputs = form
            .Select("input[type=hidden]")
            .Select(element => new KeyValuePair<string, string>(
                element.GetAttribute("name"),
                element.GetAttribute("value")));

        var formFields = hiddenInputs.Concat(new[]
        {
            new KeyValuePair<string, string>("username", usernameOrEmail),
            new KeyValuePair<string, string>("password", password),
        });

        var formRequest = new FormUrlEncodedContent(formFields);
        // NOTE(review): new Uri(...) requires the form's action to be an absolute URI - confirm.
        var formAction = new Uri(form.Attr("action"));
        var formMethod = new HttpMethod(form.Attr("method"));

        var sendFormMsg = new HttpRequestMessage(formMethod, formAction)
        {
            Content = formRequest
        };

        var response = await client.SendAsync(sendFormMsg);

        // The /login/process handler is a little odd in that both auth success and auth
        // failure have a 200 OK status code. We determine if authorizing worked by checking
        // if we were redirected to the same login form.
        response.EnsureSuccessStatusCode();

        // Did we recieve the same login form response again? Compare path with path; the
        // former Host-vs-AbsolutePath comparison could never match, so failures went unnoticed.
        if (response.RequestMessage.RequestUri.AbsolutePath == loginPageResponse.RequestMessage.RequestUri.AbsolutePath)
        {
            // Login failed and the server resent the login form with an error message.
            throw new InvalidUsernameAndPasswordException("The server rejected the login credentials.");
        }

        return cookies;
    }
}
/// <summary>
/// Checks that an <c>input</c> without a <c>type</c> attribute is not matched by
/// <c>[type=text]</c>, while one with an explicit <c>type='text'</c> is.
/// </summary>
public void DefaultTypeValue()
{
    const string markup = @"<input id='input1' /><input id='input2' type='text' />";
    var dom = CQ.CreateDocument(markup);

    // Only the input with the explicit attribute is selected.
    Assert.AreEqual(1, dom["input[type=text]"].Length);
    Assert.AreEqual(1, dom["[type=text]"].Length);

    var implicitInput = dom["#input1"];
    Assert.IsFalse(implicitInput.Is("[type=text]"));

    var explicitInput = dom["#input2"];
    Assert.IsTrue(explicitInput.Is("[type=text]"));
}
/// <summary>
/// Process the mock web request synchronously using same rules as CreateFromUrl
/// </summary>
///
/// <param name="request">
/// The request whose web request is executed.
/// </param>
///
/// <returns>
/// The document parsed from the response stream, using the encoding reported for the response.
/// </returns>
private CQ ProcessMockWebRequestSync(CsqWebRequest request)
{
    var webResponse = request.GetWebRequest().GetResponse();

    var body = webResponse.GetResponseStream();
    var bodyEncoding = CsqWebRequest.GetEncoding(webResponse);

    return CQ.CreateDocument(body, bodyEncoding);
}
/// <summary>
/// Validates a page and then every not-yet-checked relative stylesheet it links to.
/// </summary>
/// <param name="pageName">Name under which the page results are recorded.</param>
/// <param name="pageText">The HTML of the page.</param>
/// <remarks>
/// A stylesheet that cannot be downloaded or validated is recorded as a
/// <c>NonDocumentError</c> message and reported to Elmah; it does not stop the loop.
/// </remarks>
public void ValidatePage(string pageName, string pageText)
{
    Results.AppendResults(ValidateSingleFile(pageName, pageText));
    Results.HtmlValidationCompleted = true;

    var dom = CQ.CreateDocument(pageText);
    foreach (var stylesheetLink in dom["link[rel='stylesheet']"])
    {
        var linkHref = stylesheetLink.Attributes["href"] as string;
        if (string.IsNullOrWhiteSpace(linkHref))
        {
            continue;
        }

        linkHref = linkHref.Trim();

        // Each href is handled only once; it is remembered even when it is skipped below.
        if (_checkedFiles.Contains(linkHref))
        {
            continue;
        }
        _checkedFiles.Add(linkHref);

        // Any href containing "//" is treated as non-relative and is not fetched.
        if (linkHref.Contains("//"))
        {
            continue;
        }

        var linkUrl = _baseUri + linkHref;
        try
        {
            // Get CSS File
            string cssText;
            var request = HttpWebRequest.Create(linkUrl);
            using (var response = request.GetResponse())
            using (var reader = new StreamReader(response.GetResponseStream()))
            {
                cssText = reader.ReadToEnd();
            }

            Results.AppendResults(ValidateSingleFile(linkHref, cssText, true));
        }
        catch (Exception e)
        {
            var failure = new W3CHtmlValidationMessage()
            {
                Page = linkHref,
                Type = W3CHtmlValidationMessage.MessageTypes.NonDocumentError,
                Message = "Unable to perform CSS Validation"
            };
            Results.W3CCssValidationMessagesNew.Add(failure);
            Elmah.ErrorSignal.FromCurrentContext().Raise(e);
        }
    }

    Results.CssValidationCompleted = true;
}
/// <summary>
/// Downloads the episode page and returns the absolute URL of its <c>video</c> source.
/// </summary>
/// <param name="episodeResult">The episode whose <c>PageUrl</c> (relative to <c>BaseUri</c>) is downloaded.</param>
/// <returns>
/// A list with the video URL (spaces encoded as <c>%20</c>), or an empty list when the page has
/// no <c>video</c> element with a <c>src</c> attribute.
/// </returns>
public List<string> RetrieveVideoSource(EpisodeResult episodeResult)
{
    var html = _client.DownloadString(BaseUri + episodeResult.PageUrl);
    var dom = CQ.CreateDocument(html);

    // Attr returns null when nothing matches or the attribute is absent; report "no sources"
    // instead of failing with a NullReferenceException.
    var videoSrc = dom.Select("video").Attr("src");
    if (videoSrc == null)
    {
        return new List<string>();
    }

    return new List<string>
    {
        BaseUri + videoSrc.Replace(" ", "%20")
    };
}
/// <summary>
/// Replaces the document type of a parsed HTML5 document with XHTML Strict and checks that
/// the first node renders as the XHTML 1.0 Strict doctype declaration.
/// </summary>
public void TestDocTypeXHTML()
{
    var expectedDocType = @"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Strict//EN"" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"">";

    var dom = CQ.CreateDocument("<!doctype html >");
    var strictDocType = dom.Document.CreateDocumentType(DocType.XHTMLStrict);
    dom.Document.DocTypeNode = strictDocType;

    Assert.AreEqual(expectedDocType, dom.First().Render());
}