/// <summary>
/// Applies the requested decoding to <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to decode.</param>
/// <param name="decoding">
/// Selects the decoder: <c>UrlDecoding.HtmlDecode</c> uses <c>HttpUtility.HtmlDecode</c>,
/// <c>UrlDecoding.UrlDecode</c> uses <c>HttpUtility.UrlDecode</c>.
/// </param>
/// <returns>
/// The decoded text, or <paramref name="text"/> unchanged for any other decoding value.
/// </returns>
protected virtual string ApplyUrlDecoding(string text, UrlDecoding decoding)
{
    if (decoding == UrlDecoding.HtmlDecode)
    {
        return HttpUtility.HtmlDecode(text);
    }

    if (decoding == UrlDecoding.UrlDecode)
    {
        return HttpUtility.UrlDecode(text);
    }

    // Every other value means "leave the text as it is".
    return text;
}
/// <summary>
/// Turns a matched URL fragment into an absolute URL: the match is optionally run through a
/// format string, decoded, and finally resolved against <paramref name="currentUrl"/> when it
/// is not already a well-formed absolute URI.
/// </summary>
/// <param name="currentUrl">URL used as the base when the match has to be made absolute.</param>
/// <param name="matchedUrl">The matched URL; <c>null</c> is treated as an empty string.</param>
/// <param name="matchedUrlFormatString">
/// Optional <c>string.Format</c> pattern applied to the match (the match is argument <c>{0}</c>).
/// Skipped when either the pattern or the match is null or empty.
/// </param>
/// <param name="matchedUrlDecoding">Decoding passed on to <c>ApplyUrlDecoding</c>.</param>
/// <returns>
/// The resulting URL, or an empty string when the match cannot be combined with
/// <paramref name="currentUrl"/>.
/// </returns>
/// <exception cref="UriFormatException">
/// The match is not absolute and <paramref name="currentUrl"/> is not a valid URI.
/// </exception>
protected virtual string FormatDecodeAbsolutifyUrl(string currentUrl, string matchedUrl, string matchedUrlFormatString, UrlDecoding matchedUrlDecoding)
{
    // 1. make sure the matched string is not null
    string result = matchedUrl ?? string.Empty;

    // 2. format the matched url when both the format string and the matched url aren't null or empty
    if (!string.IsNullOrEmpty(result) && !string.IsNullOrEmpty(matchedUrlFormatString))
    {
        result = string.Format(matchedUrlFormatString, result);
    }

    // 3. decode the match
    result = ApplyUrlDecoding(result, matchedUrlDecoding);

    // 4. build an absolute url when needed
    if (!Uri.IsWellFormedUriString(result, UriKind.Absolute))
    {
        // 4. a) workaround for .net bug when combining uri with a query only.
        // Ordinal comparison: "?" is a protocol token, not linguistic text, so a
        // culture-sensitive match (which may skip ignorable characters) must not be used,
        // otherwise Substring(1) below could strip the wrong character.
        if (result.StartsWith("?", StringComparison.Ordinal))
        {
            result = new UriBuilder(currentUrl) { Query = result.Substring(1) }.Uri.ToString();
        }
        else
        {
            Uri uri = null;
            if (Uri.TryCreate(new Uri(currentUrl), result, out uri))
            {
                result = uri.ToString();
            }
            else
            {
                // The match could not be resolved against the current url.
                result = string.Empty;
            }
        }
    }

    return result;
}
/// <summary>
/// Runs the crawl: configures a <c>Crawler</c>, registers the page and completion callbacks,
/// enqueues the start task, runs the crawler and finally calls <c>base.NextTime</c>.
/// </summary>
/// <remarks>
/// For every page the callback looks up rows matching a fixed list of CSS class selectors and
/// builds one <c>RegionalModel</c> per row (or per link), passing it to <c>CreateSQLCommand</c>.
/// Rows that carry a link additionally enqueue the linked page, tagged with the new record's OID.
/// An exception thrown inside the <c>try</c> block is logged and not rethrown; in that case
/// <c>base.NextTime</c> is not reached.
/// </remarks>
/// <param name="context">Job execution context, forwarded to <c>base.NextTime</c>.</param>
public override void Execute(IJobExecutionContext context)
{
    try
    {
        // CSS class selectors of the table rows to look for on each page.
        string[] rowSelectors = { ".provincetr", ".citytr", ".countytr", ".towntr", ".villagetr" };

        // NOTE: crawler.Cookie is not set; a formerly hard-coded cookie assignment was disabled.
        // NOTE(review): wait/Timeout look like milliseconds - confirm against Crawler.
        Crawler crawler = new Crawler
        {
            wait = 2000,
            Timeout = 10000,
            ThreadNum = 5,
        };

        crawler.DoSomeThing((n, que) =>
        {
            Qfun q = HtmlParser.Query(n);
            if (q == null)
            {
                return;
            }

            // Looked up and parsed anew for every record that is built, never ahead of time.
            Func<Guid> parentOid = () => Guid.Parse(que.pairs.FirstOrDefault(f => f.Key == "ParentOID").Value);

            foreach (string selector in rowSelectors)
            {
                foreach (var row in q(selector))
                {
                    var cells = q(row).find("td");

                    if (cells.length == 2 || cells.length == 1)
                    {
                        var anchors = q(cells).find("a");
                        if (anchors.length > 0)
                        {
                            // Linked row: ID and name come from the first two anchors.
                            var region = new RegionalModel
                            {
                                RegionalDataOID = Guid.NewGuid(),
                                ID = anchors[0].textContent,
                                Name = anchors[1].textContent,
                                ParentOID = parentOid(),
                            };

                            // Cut the href down before decoding: skip the first 3 characters and
                            // drop a tail whose size depends on whether the href contains '/'.
                            string href = anchors[0].getAttribute("href");
                            int trimmed = href.IndexOf('/') > -1 ? 5 : 8;
                            string childUrl = UrlDecoding.Decoding(href.Substring(3, href.Length - trimmed));

                            crawler.EnQueue(CreateQueue(childUrl, region.RegionalDataOID.ToString()));
                            CreateSQLCommand(region);
                        }
                        else
                        {
                            // Row without links: ID and name come straight from the cells.
                            var region = new RegionalModel
                            {
                                RegionalDataOID = Guid.NewGuid(),
                                ID = cells[0].textContent,
                                Name = cells[1].textContent,
                                ParentOID = parentOid(),
                            };
                            CreateSQLCommand(region);
                        }

                        continue;
                    }

                    if (cells.length == 3)
                    {
                        // Three cells: ID from the first, name from the third; the middle one is ignored.
                        var region = new RegionalModel
                        {
                            RegionalDataOID = Guid.NewGuid(),
                            ID = cells[0].textContent,
                            Name = cells[2].textContent,
                            ParentOID = parentOid(),
                        };
                        CreateSQLCommand(region);
                        continue;
                    }

                    // Any other cell count: every anchor in every cell becomes its own record.
                    foreach (var cell in cells)
                    {
                        foreach (var link in q(cell).find("a"))
                        {
                            var region = new RegionalModel
                            {
                                RegionalDataOID = Guid.NewGuid(),
                                Name = link.textContent,
                                ParentOID = parentOid(),
                            };

                            // The ID is the href without its last 5 characters.
                            string href = link.getAttribute("href");
                            region.ID = href.Substring(0, href.Length - 5);

                            crawler.EnQueue(CreateQueue(href, region.RegionalDataOID.ToString()));
                            CreateSQLCommand(region);
                        }
                    }
                }
            }
        }, () =>
        {
            // Second callback handed to DoSomeThing - presumably invoked once crawling has
            // finished (confirm against Crawler).
            InsertIntoMSDB();
            crawler.Done();
        });

        crawler.EnQueue(BeginTask());
        crawler.RunCrawler();
        base.NextTime(context);
    }
    catch (Exception ex)
    {
        log.Error(ex, $"{this.TaskName}同步出错");
    }
}