/// <summary>
/// Walks every <c>tr</c> found beneath the <c>div</c> elements whose
/// <c>Identity()</c> equals "main" and stores one <c>Class1</c> record per row
/// into <paramref name="RTc"/>.
/// </summary>
/// <param name="RTc">
/// Destination array, filled from index 0 in row order. It must have at least as
/// many slots as there are rows; otherwise the indexer throws.
/// </param>
/// <param name="document">Parsed HTML document that is queried for the rows.</param>
private void GetUrlText_1(Class1[] RTc, IHtmlDocument document)
{
    IEnumerable<IHtmlElement> result = document.Find("div").Where(d => d.Identity() == "main");
    IEnumerable<IHtmlElement> t = result.Find("tr");

    int r = 0;
    foreach (var item in t)
    {
        Class1 _class = new Class1();

        // Title link: supplies both the display text and the target URL.
        IHtmlElement item_a = item.FindFirst(".bthead>a");
        _class.TextName = item_a.InnerText().Trim();
        _class.href = item_a.Attribute("href").Value().Trim();

        _class.Quyu = GetN_value(item, ".a_xq1");

        string st = GetN_value(item, ".qj-lijjrname");
        _class.PersonName = st.Replace(":", "").Trim();

        _class.Laiyuan = GetN_value(item, "label");

        // NOTE(review): LIST_G is defined elsewhere; presumably it filters the
        // space-split tokens so that index 2 is the address - confirm.
        string[] str_ = GetN_value(item, ".qj-listleft").Split(' ');
        List<string> str_1 = LIST_G(str_);
        _class.Address = str_1[2];

        // The fixed offsets below are written for a 10-token split. When the
        // split yields a different count, every index is shifted by the
        // difference (j is 0 for exactly 10 tokens).
        string[] ssp = GetN_value(item, ".qj-listright").Split(' ');
        int j = 10 - ssp.Length;
        _class.SumMoney = ssp[1 - j] + ssp[2 - j];
        _class.PingMoney = ssp[4 - j];
        _class.Allpm = ssp[7 - j] + ssp[9 - j];

        // The date is only taken when the text splits into exactly three tokens.
        string[] datetime = GetN_value(item, ".qj-listjjr").Split(' ');
        _class.datetime = datetime.Length == 3 ? datetime[2] : "";

        RTc[r] = _class;
        r++;
    }
}
/// <summary>
/// Collects the inner text of every element matched by a CSS selector.
/// </summary>
/// <param name="cssKey">CSS selector used to locate the elements.</param>
/// <returns>
/// The inner texts in match order, or <c>null</c> when nothing matches the
/// selector, the lookup yields <c>null</c>, or any exception is raised.
/// </returns>
public List<string> GetClassTxt(string cssKey)
{
    try
    {
        // Only run the query when the selector matches something.
        var matches = doc.Exists(cssKey) ? doc.Find(cssKey) : null;
        if (matches == null)
        {
            return null;
        }

        List<string> texts = new List<string>();
        foreach (var element in matches)
        {
            texts.Add(element.InnerText());
        }

        return texts;
    }
    catch
    {
        // Best-effort lookup: every failure is reported to the caller as null.
        return null;
    }
}
/// <summary>
/// Removes the stylesheet links (<c>href</c> ending in ".css") and script
/// elements (<c>src</c> ending in ".js") from a document.
/// </summary>
/// <param name="document">Document to strip; a <c>null</c> document is ignored.</param>
/// <param name="headScopeOnly">
/// When <c>true</c>, only references inside the &lt;head&gt; element are removed;
/// otherwise references anywhere in the document are removed.
/// </param>
public void ClearAllReference(IHtmlDocument document, bool headScopeOnly = true)
{
    if (document == null)
    {
        return;
    }

    // Same pair of selectors, optionally restricted to descendants of <head>.
    string selector = headScopeOnly
        ? "head link[rel=stylesheet][href$=.css], head script[src$=.js]"
        : "link[rel=stylesheet][href$=.css], script[src$=.js]";

    document.Find(selector).Remove();
}
/// <summary>
/// Lazily enumerates the absolute URI of every <c>a[href]</c> element in the
/// document, with the fragment part removed.
/// </summary>
/// <param name="document">Document whose anchors are enumerated.</param>
/// <returns>Absolute URIs, one per anchor, in document query order.</returns>
protected virtual IEnumerable<string> FindLinks(IHtmlDocument document)
{
    // Because this is an iterator, this call runs when enumeration starts.
    document.ResolveUriToAbsoluate();

    foreach (var anchor in document.Find("a[href]"))
    {
        var href = anchor.Attribute("href").Value();

        // Combine with the document URI, then drop the "#fragment" part.
        var builder = new UriBuilder(new Uri(document.DocumentUri, href));
        builder.Fragment = null;

        yield return builder.Uri.AbsoluteUri;
    }
}
/// <summary>
/// Fills <c>model.capitalInfo</c> (capital-related information) from the cells
/// of the <paramref name="i"/>-th element matching ".f-lbiao".
/// </summary>
/// <param name="sorceIhtml">Source document to query.</param>
/// <param name="model">Model to update; the same instance is returned.</param>
/// <param name="i">Zero-based index into the ".f-lbiao" matches.</param>
/// <returns>The <paramref name="model"/> instance passed in.</returns>
public static TargetModel Modular02(IHtmlDocument sorceIhtml, TargetModel model, int i)
{
    var section = sorceIhtml.Find(".f-lbiao").ElementAt(i);
    var cells = section.Find("tr td").ToList();

    model.capitalInfo = FillModel<CapitalInfo>(cells);
    return model;
}
/// <summary>
/// Fills <c>model.orgCodeInfo</c> (organization code information) from the
/// cells of the <paramref name="i"/>-th element matching ".f-lbiao".
/// </summary>
/// <param name="sorceIhtml">Source document to query.</param>
/// <param name="model">Model to update; the same instance is returned.</param>
/// <param name="i">Zero-based index into the ".f-lbiao" matches.</param>
/// <returns>The <paramref name="model"/> instance passed in.</returns>
public static TargetModel Modular03(IHtmlDocument sorceIhtml, TargetModel model, int i)
{
    var section = sorceIhtml.Find(".f-lbiao").ElementAt(i);
    var cells = section.Find("tr td").ToList();

    model.orgCodeInfo = FillModel<OrgCodeInfo>(cells);
    return model;
}
/// <summary>
/// Determines the container used as the rendering scope of a partial view.
/// </summary>
/// <param name="document">The loaded document.</param>
/// <returns>
/// The single <c>body</c> element when the document has one; otherwise the
/// document itself.
/// </returns>
protected virtual IHtmlContainer GetPartialScope(IHtmlDocument document)
{
    // SingleOrDefault throws if the query yields more than one body element.
    var body = document.Find("body").SingleOrDefault();

    return (IHtmlContainer)body ?? document;
}
// Disabled legacy scraper, retained as-is (commented-out code):
//mshtml.HTMLDocument mhtml = (mshtml.HTMLDocument)Webbrowser2.Document;
//string html = mhtml.body.innerHTML;
//IHtmlDocument document = new JumonyParser().Parse(html);
//IEnumerable<IHtmlElement> result = document.Find("div").Where(i => i.Identity() == "all-list");
//result = result.Find("li");
// foreach (var li in result)
// {
//     Class1 _class = new Class1();
//     _class.TextName = GetN_value(li, ".media-body-title>a");
//     _class.href = li.FindFirst(".media-body-title>a").Attribute("href").Value();
//     _class.Quyu = "百姓";
//     _class.Laiyuan = "百姓";
//     _class.Address = GetN_value(li, ".typo-small");
//     string this_spk = GetN_value(li, ".typo-small");
//     _class.SumMoney = GetN_value(li, ".highlight");
//     string[] thip = this_spk.Split('/');
//     _class.PingMoney = thip.Length>1? thip[1]:"";
//     _class.Allpm = "";
//     _class.datetime = GetN_value(li, ".pull-right");
//     if (li.Exists("img"))
//     {
//         if (li.FindFirst("img").Attribute("src").Value().IndexOf("http") < 0)
//         {
//             _class.Image_str = string.Empty;
//         }
//         else
//         {
//             _class.Image_str = "有";
//         }
//     }
//     else if (li.Exists(".img"))
//     {
//         if (li.FindFirst(".img").Attribute("src").Value().IndexOf("http") < 0)
//         {
//             _class.Image_str = string.Empty;
//         }
//         else
//         {
//             _class.Image_str = "有";
//         }
//     }
//     else
//     {
//         _class.Image_str = string.Empty;
//     }
//     L_Class.Add(_class);
// }
#endregion
#endregion
#region 读取网站区域
/// <summary>
/// Reads the links inside the <c>div</c> elements whose <c>Identity()</c> equals
/// "qySelectFirst" and appends one <c>Quyu</c> entry per link to
/// <c>Quyulist_58</c>.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
private void GetSelectQuYu(IHtmlDocument document)
{
    IEnumerable<IHtmlElement> result = document.Find("div").Where(i => i.Identity() == "qySelectFirst").Find("a");

    foreach (var rt in result)
    {
        Quyu urt = new Quyu();
        // NOTE(review): Id is a member declared outside this method; every
        // entry created here receives the same value - confirm that is intended.
        urt.ID = Id;
        urt.Name = rt.InnerText();
        urt.Href = rt.Attribute("href").Value();
        Quyulist_58.Add(urt);
    }
}
/// <summary>
/// Extracts book entries from a document with CSS selectors and appends the
/// ones not already present to <c>bookList</c>.
/// </summary>
/// <param name="doc">Document to parse.</param>
/// <param name="count">Incremented once for every book added to <c>bookList</c>.</param>
private static void ParseDataWithCss(IHtmlDocument doc, ref int count)
{
    const string bookSelector = "body > div.sokk-body > div > div > div.col-sm-9.col-md-10.col-lg-10.col-xs-12 > div > div.books > div > div";
    const string titleSelector = "div > div > div > div.title >a";
    const string ratingSelector = "div > div > div > div.rating > span:nth-child(3)";
    const string abstractSelector = "div > div > div > div.abstract";

    foreach (IHtmlElement bookInfo in doc.Find(bookSelector))
    {
        var book = new Book();

        // Only the first title link is used.
        foreach (IHtmlElement titleLink in bookInfo.Find(titleSelector))
        {
            book.Title = titleLink.InnerText();
            book.Url = new Uri("http://www.yousuu.com" + titleLink.Attribute("href").Value());
            break;
        }

        // If several rating spans match, the last one wins.
        foreach (IHtmlElement rating in bookInfo.Find(ratingSelector))
        {
            book.RateNumber = rating.InnerText();
        }

        foreach (IHtmlElement summary in bookInfo.Find(abstractSelector))
        {
            // The element's string form is split on ':', '>' and '<'; the field
            // positions below are only trusted when exactly 19 pieces result.
            string markup = summary.ToString();
            var pieces = markup.Split(':', ':', '>', '<');
            if (pieces.Length == 19)
            {
                book.Author = pieces[3];
                book.WordNumber = pieces[6];
                book.UpdateTime = pieces[9];
                book.Score = pieces[14];
            }
            else
            {
                // Unexpected layout: discard this book entirely.
                book = null;
                break;
            }
        }

        lock (bookList)
        {
            if (book == null || bookList.Contains(book))
            {
                continue;
            }

            count++;
            bookList.Add(book); // append the record to the shared list
        }
    }
}
/// <summary>
/// Adds a <![CDATA[<meta name="generator" content="Jumony" />]]> element to the
/// document's head. Does nothing when the document has no DOM modifier or no
/// "html head" element.
/// </summary>
/// <param name="document">Document that receives the meta element.</param>
private void AddGeneratorMetaData(IHtmlDocument document)
{
    var modifier = document.DomModifier;
    if (modifier == null)
    {
        return;
    }

    var head = document.Find("html head").FirstOrDefault();
    if (head == null)
    {
        return;
    }

    var meta = modifier.AddElement(head, "meta");
    meta.SetAttribute("name", "generator");
    meta.SetAttribute("content", "Jumony");
}
/// <summary>
/// Fills <c>model.taxInfo</c> (tax registration information) from the header
/// and data cells of the <paramref name="x"/>-th element matching ".f-lbiao".
/// </summary>
/// <param name="sorceIhtml">Source document to query.</param>
/// <param name="model">Model to update; the same instance is returned.</param>
/// <param name="x">Zero-based index into the ".f-lbiao" matches.</param>
/// <returns>The <paramref name="model"/> instance passed in.</returns>
public static TargetModel Modular04(IHtmlDocument sorceIhtml, TargetModel model, int x)
{
    var trHtml_tax = sorceIhtml.Find(".f-lbiao").ElementAt(x).Find("tr").ToList();

    // All <th> cells and all <td> cells across the rows are passed to FillModel
    // as two separate lists.
    var headerCells = trHtml_tax.Find("th").ToList();
    var valueCells = trHtml_tax.Find("td").ToList();

    model.taxInfo = FillModel<TaxInfo>(headerCells, valueCells);
    return model;
}
/// <summary>
/// Builds one <c>Class1</c> record for every ".list-items" element in the
/// document and appends it to <paramref name="L_Class"/>.
/// </summary>
/// <param name="document">Parsed HTML document to query.</param>
/// <param name="L_Class">List that receives the records.</param>
private void GetUrlText_2(IHtmlDocument document, List<Class1> L_Class)
{
    IEnumerable<IHtmlElement> result1 = document.Find(".list-items");
    foreach (var item in result1)
    {
        Class1 _class = new Class1();
        IHtmlElement item_a = item.FindFirst("a");

        // Counts as having an image when a src is present and "default.jpg" is
        // not found at an index greater than 0.
        string img_str = item.Exists("img") ? item.FindFirst("img").Attribute("src").Value() : "";
        _class.Image_Count = (img_str.Length > 0 && !(img_str.IndexOf("default.jpg") > 0)) ? 1 : 0;

        // Inner texts of the item's div elements; fields are picked by position.
        List<string> ls = new List<string>();
        foreach (var d in item.Find("div"))
        {
            ls.Add(d.InnerText());
        }

        _class.TextName = ls[2];
        _class.SumMoney = ls[4];
        _class.Quyu = "赶集";
        _class.Allpm = ls[1];
        _class.Address = ls[0];
        _class.href = item_a.Attribute("href").Value().Trim();

        // The date text lives inside an HTML comment within the anchor markup.
        // The comment delimiters are replaced by markers, the text between the
        // markers is cut out, and then the text between the first '>' and the
        // following '<' is taken.
        string item_aa = item_a.ToString().Replace("<!--", "stu1").Replace("-->", "stp2");
        item_aa = item_aa.Substring(item_aa.IndexOf("stu1") + 4);
        item_aa = item_aa.Substring(0, item_aa.IndexOf("stp2"));
        item_aa = item_aa.Substring(item_aa.IndexOf(">") + 1);
        item_aa = item_aa.Substring(0, item_aa.IndexOf("<"));
        _class.datetime = item_aa;

        // The area is read from token 6, falling back to token 5 when token 6 is
        // blank once the '㎡' sign has been removed.
        string[] pm = _class.Allpm.Split(' ');
        string areaText = pm[6].Replace('㎡', ' ').Trim();
        if (areaText.Length <= 0)
        {
            areaText = pm[5].Replace('㎡', ' ').Trim();
        }
        double pm_int = Convert.ToDouble(areaText);

        // Total (with the "万元" suffix removed) divided by the area, scaled by 10000.
        double ss = Convert.ToDouble(_class.SumMoney.Replace("万元", "").Trim()) / pm_int;
        _class.PingMoney = "≈" + Convert.ToInt32(ss * 10000).ToString();

        _class.Image_str = _class.Image_Count > 0 ? "有" : string.Empty;
        L_Class.Add(_class);

        // NOTE(review): r is not declared in this method; presumably a counter
        // field of the enclosing class - confirm.
        r++;
    }
}
/// <summary>
/// Lazily loads the resource referenced by every element carrying a
/// <c>src</c> attribute.
/// </summary>
/// <param name="document">Document whose <c>[src]</c> elements are enumerated.</param>
/// <returns>
/// One <c>LinkedResource</c> per element whose <c>src</c> value is non-blank and
/// can be combined with the document URI; other elements are skipped.
/// </returns>
public static IEnumerable<LinkedResource> GetResources(IHtmlDocument document)
{
    foreach (var element in document.Find("[src]"))
    {
        var source = element.Attribute("src").Value();

        Uri resourceUrl;
        if (!string.IsNullOrWhiteSpace(source)
            && Uri.TryCreate(document.DocumentUri, source, out resourceUrl))
        {
            yield return LoadResource(resourceUrl);
        }
    }
}
/// <summary>
/// Writes to the test context how many elements a selector matches in a document.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="selector">CSS selector expression to evaluate.</param>
private void TestSelector( IHtmlDocument document, string selector )
{
    var matchCount = document.Find( selector ).Count();

    TestContext.WriteLine( "Selector \"{0}\" seleted {1} elements", selector, matchCount );
}
/// <summary>
/// Determines the container used as the rendering scope of a partial view.
/// </summary>
/// <param name="document">The loaded document.</param>
/// <returns>
/// The single <c>body</c> element when the document has one; otherwise the
/// document itself.
/// </returns>
protected virtual IHtmlContainer GetPartialScope( IHtmlDocument document )
{
    // SingleOrDefault throws if the query yields more than one body element.
    var bodyElement = document.Find( "body" ).SingleOrDefault();

    return bodyElement != null ? (IHtmlContainer) bodyElement : document;
}
/// <summary>
/// Writes to the test context how many elements a selector matches in a document.
/// </summary>
/// <param name="document">Document to query.</param>
/// <param name="selector">CSS selector expression to evaluate.</param>
private void TestSelector(IHtmlDocument document, string selector)
{
    var matchCount = document.Find(selector).Count();

    TestContext.WriteLine("Selector \"{0}\" seleted {1} elements", selector, matchCount);
}
/// <summary>
/// Searches the whole document for elements that satisfy a CSS selector.
/// </summary>
/// <param name="selector">CSS selector expression.</param>
/// <returns>The elements that match the selector.</returns>
protected IEnumerable<IHtmlElement> Find(string selector)
{
    IEnumerable<IHtmlElement> matches = Document.Find(selector);
    return matches;
}
/// <summary>
/// Adds a <![CDATA[<meta name="generator" content="Jumony" />]]> element to the
/// document's head. Does nothing when the document has no DOM modifier or no
/// "html head" element.
/// </summary>
/// <param name="document">Document that receives the meta element.</param>
private void AddGeneratorMetaData( IHtmlDocument document )
{
    var modifier = document.DomModifier;
    if ( modifier == null )
        return;

    var head = document.Find( "html head" ).FirstOrDefault();
    if ( head == null )
        return;

    var meta = modifier.AddElement( head, "meta" );
    meta.SetAttribute( "name", "generator" );
    meta.SetAttribute( "content", "Jumony" );
}
/// <summary>
/// Lazily loads the resource referenced by every element carrying a
/// <c>src</c> attribute.
/// </summary>
/// <param name="document">Document whose <c>[src]</c> elements are enumerated.</param>
/// <returns>
/// One <c>LinkedResource</c> per element whose <c>src</c> value is non-blank and
/// can be combined with the document URI; other elements are skipped.
/// </returns>
public static IEnumerable<LinkedResource> GetResources( IHtmlDocument document )
{
    foreach ( var element in document.Find( "[src]" ) )
    {
        var source = element.Attribute( "src" ).Value();

        // Skip blank values and values that cannot be resolved against the document URI.
        Uri resourceUrl;
        if ( string.IsNullOrWhiteSpace( source ) || !Uri.TryCreate( document.DocumentUri, source, out resourceUrl ) )
            continue;

        yield return LoadResource( resourceUrl );
    }
}
/// <summary>
/// Removes the stylesheet links (<c>href</c> ending in ".css") and script
/// elements (<c>src</c> ending in ".js") from a document.
/// </summary>
/// <param name="document">Document to strip; a <c>null</c> document is ignored.</param>
/// <param name="headScopeOnly">
/// When <c>true</c>, only references inside the &lt;head&gt; element are removed;
/// otherwise references anywhere in the document are removed.
/// </param>
public void ClearAllReference( IHtmlDocument document, bool headScopeOnly = true )
{
    if ( document != null )
    {
        // Same pair of selectors, optionally restricted to descendants of <head>.
        string selector;
        if ( headScopeOnly )
            selector = "head link[rel=stylesheet][href$=.css], head script[src$=.js]";
        else
            selector = "link[rel=stylesheet][href$=.css], script[src$=.js]";

        document.Find( selector ).Remove();
    }
}