Пример #1
0
 /// <summary>
 /// Applies the requested decoding to <paramref name="text"/>.
 /// </summary>
 /// <param name="text">The text to decode.</param>
 /// <param name="decoding">Which decoding to apply.</param>
 /// <returns>
 /// The result of <c>HttpUtility.HtmlDecode</c> for <c>UrlDecoding.HtmlDecode</c>,
 /// the result of <c>HttpUtility.UrlDecode</c> for <c>UrlDecoding.UrlDecode</c>,
 /// and <paramref name="text"/> unchanged for any other value.
 /// </returns>
 protected virtual string ApplyUrlDecoding(string text, UrlDecoding decoding)
 {
     if (decoding == UrlDecoding.HtmlDecode)
     {
         return HttpUtility.HtmlDecode(text);
     }

     if (decoding == UrlDecoding.UrlDecode)
     {
         return HttpUtility.UrlDecode(text);
     }

     // Any other value: hand the text back untouched.
     return text;
 }
Пример #2
0
 /// <summary>
 /// Turns a matched (possibly relative) url into an absolute url: the match is optionally
 /// formatted, then decoded, and finally resolved against <paramref name="currentUrl"/>
 /// when it is not already a well-formed absolute url.
 /// </summary>
 /// <param name="currentUrl">Absolute url used as the base when the match is relative or query-only.</param>
 /// <param name="matchedUrl">The matched url; <c>null</c> is treated as an empty string.</param>
 /// <param name="matchedUrlFormatString">
 /// Optional composite format string with the match as argument <c>{0}</c>; only applied
 /// when both it and the match are non-empty.
 /// </param>
 /// <param name="matchedUrlDecoding">Decoding passed to <c>ApplyUrlDecoding</c>.</param>
 /// <returns>
 /// The absolute url, or an empty string when the match cannot be combined with
 /// <paramref name="currentUrl"/>.
 /// </returns>
 /// <exception cref="UriFormatException">
 /// <paramref name="currentUrl"/> is not a valid url and the match needs to be resolved against it.
 /// </exception>
 protected virtual string FormatDecodeAbsolutifyUrl(string currentUrl, string matchedUrl, string matchedUrlFormatString, UrlDecoding matchedUrlDecoding)
 {
     // 1. make sure the matched string is not null
     string result = matchedUrl ?? string.Empty;

     // 2. format the matched url when both the format string and the matched url aren't null or empty
     if (!string.IsNullOrEmpty(result) && !string.IsNullOrEmpty(matchedUrlFormatString))
     {
         result = string.Format(matchedUrlFormatString, result);
     }

     // 3. decode the match
     result = ApplyUrlDecoding(result, matchedUrlDecoding);

     // 4. nothing more to do when the result is already a well-formed absolute url
     if (Uri.IsWellFormedUriString(result, UriKind.Absolute))
     {
         return result;
     }

     // 4. a) workaround for .net bug when combining uri with a query only.
     //       The comparison is ordinal: a culture-sensitive StartsWith can report a match
     //       when ignorable characters precede the '?', and Substring(1) would then strip
     //       the wrong character.
     if (result.StartsWith("?", StringComparison.Ordinal))
     {
         return new UriBuilder(currentUrl) { Query = result.Substring(1) }.Uri.ToString();
     }

     // 4. b) resolve the relative url against the current url; empty string when that fails
     Uri uri;
     return Uri.TryCreate(new Uri(currentUrl), result, out uri) ? uri.ToString() : string.Empty;
 }
Пример #3
0
 /// <summary>
 /// Job entry point: configures a <c>Crawler</c>, registers a page callback that turns
 /// table rows matched by the selectors in <c>NeedClass</c> into <c>RegionalModel</c>
 /// records (passed to <c>CreateSQLCommand</c>) and enqueues the linked pages, then
 /// starts the crawl from <c>BeginTask()</c> and finally calls <c>base.NextTime</c>.
 /// Any exception that reaches this method is logged and swallowed.
 /// </summary>
 /// <param name="context">Job execution context; only forwarded to <c>base.NextTime</c>.</param>
 public override void Execute(IJobExecutionContext context)
 {
     try
     {
         // CSS classes of the table rows to look for
         string[] NeedClass = new string[] { ".provincetr", ".citytr", ".countytr", ".towntr", ".villagetr" };
         Crawler  crawler   = new Crawler();
         // NOTE(review): wait/Timeout are presumably milliseconds - confirm against Crawler
         crawler.wait    = 2000;
         crawler.Timeout = 10000;
         //crawler.Cookie = "__utmz=207252561.1566065018.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); AD_RS_COOKIE=20082856; __utma=207252561.1090452968.1566065018.1566892516.1566911665.4; __utmc=207252561; __utmt=1; wzws_reurl=L3Rqc2ovdGpiei90anlxaGRtaGN4aGZkbS8yMDE4LzEzLmh0bWw=; __utmb=207252561.2.10.1566911665; wzws_cid=71a25ce145fef3ef24abcd820a9cb0a7362be1e734c83a66d1489e6dd05749fe7d46940c76eda51782a874be4358594674e0ffc98ea6cd85dbd9af045cc53d1d2f71135cff0df7fdf0e26a219f19ffc6";
         crawler.ThreadNum = 5;
         // First callback: handles one fetched item (n is handed to HtmlParser.Query, que carries
         // the key/value pairs of the queue entry). Second callback: flushes via InsertIntoMSDB()
         // and calls crawler.Done() - presumably invoked once the queue is drained; confirm.
         // NOTE(review): if Crawler runs this callback on its own threads, exceptions thrown
         // inside it may not reach the catch block below - verify.
         crawler.DoSomeThing((n, que) =>
         {
             Qfun q = HtmlParser.Query(n);
             if (q != null)
             {
                 // Try every row class in turn
                 for (int i = 0; i < NeedClass.Length; i++)
                 {
                     var Class = q(NeedClass[i]);
                     foreach (var items in Class)
                     {
                         var td = q(items).find("td");
                         if (td.length == 2 || td.length == 1)
                         {
                             var a = q(td).find("a");
                             if (a.length > 0)
                             {
                                 // Row with links: first anchor text is used as the ID, second as the name.
                                 // NOTE(review): a[1] is read although only a.length > 0 was checked.
                                 RegionalModel regional   = new RegionalModel();
                                 regional.RegionalDataOID = Guid.NewGuid();
                                 regional.ID        = a[0].textContent;
                                 regional.Name      = a[1].textContent;
                                 // NOTE(review): Guid.Parse receives null if no "ParentOID" pair exists
                                 // (assuming pairs holds KeyValuePair<string, string>) - confirm it is always set.
                                 regional.ParentOID = Guid.Parse(que.pairs.FirstOrDefault(f => f.Key == "ParentOID").Value);
                                 string Url         = a[0].getAttribute("href");
                                 // Both branches skip the first 3 characters of the href; the first drops
                                 // the last 2 characters, the second the last 5.
                                 // NOTE(review): Substring throws when the href is shorter than 5 / 8
                                 // characters; the intended shape of the result is not visible here.
                                 if (Url.IndexOf('/') > -1)
                                 {
                                     Url = Url.Substring(3, Url.Length - 5);
                                 }
                                 else
                                 {
                                     Url = Url.Substring(3, Url.Length - 8);
                                 }
                                 Url = UrlDecoding.Decoding(Url);
                                 // Queue the child page, passing this record's OID along with it
                                 crawler.EnQueue(CreateQueue(Url, regional.RegionalDataOID.ToString()));
                                 CreateSQLCommand(regional);
                             }
                             else
                             {
                                 // Row without links: the cell texts are used directly and nothing is enqueued.
                                 // NOTE(review): td[1] is read even when td.length == 1.
                                 RegionalModel regional   = new RegionalModel();
                                 regional.RegionalDataOID = Guid.NewGuid();
                                 regional.ID        = td[0].textContent;
                                 regional.Name      = td[1].textContent;
                                 regional.ParentOID = Guid.Parse(que.pairs.FirstOrDefault(f => f.Key == "ParentOID").Value);
                                 CreateSQLCommand(regional);
                             }
                         }
                         else if (td.length == 3)
                         {
                             // Three-cell row: first cell is the ID, third cell the name; the middle cell is ignored
                             RegionalModel regional   = new RegionalModel();
                             regional.RegionalDataOID = Guid.NewGuid();
                             regional.ID        = td[0].textContent;
                             regional.Name      = td[2].textContent;
                             regional.ParentOID = Guid.Parse(que.pairs.FirstOrDefault(f => f.Key == "ParentOID").Value);
                             CreateSQLCommand(regional);
                         }
                         else
                         {
                             // Any other cell count: every anchor in every cell becomes its own record
                             foreach (var item in td)
                             {
                                 var a = q(item).find("a");
                                 foreach (var href in a)
                                 {
                                     RegionalModel regional   = new RegionalModel();
                                     regional.RegionalDataOID = Guid.NewGuid();
                                     regional.Name            = href.textContent;
                                     regional.ParentOID       = Guid.Parse(que.pairs.FirstOrDefault(f => f.Key == "ParentOID").Value);
                                     string Url  = href.getAttribute("href");
                                     // ID is the href minus its last 5 characters (presumably ".html" - confirm)
                                     regional.ID = Url.Substring(0, Url.Length - 5);
                                     crawler.EnQueue(CreateQueue(Url, regional.RegionalDataOID.ToString()));
                                     CreateSQLCommand(regional);
                                 }
                             }
                         }
                     }
                 }
             }
         }, () => { InsertIntoMSDB(); crawler.Done(); });
         // Seed the queue with the initial task, run the crawler, then call base.NextTime
         crawler.EnQueue(BeginTask());
         crawler.RunCrawler();
         base.NextTime(context);
     }
     catch (Exception ex)
     {
         // Failures are logged only; nothing is rethrown to the caller
         log.Error(ex, $"{this.TaskName}同步出错");
     }
 }