/// <summary>
/// Produces a result that carries this phase's <c>List</c> member as its list result.
/// </summary>
/// <param name="context">Execution context; not read by this implementation.</param>
/// <returns>A new result whose <c>ListResult</c> is the <c>List</c> member.</returns>
public override PhaseResult Run(Context context)
{
    return new PhaseResult(this) { ListResult = List };
}
/// <summary>
/// Matches the browser's current document URI against the resolved
/// <c>RegularExpression</c> and, on a match, stores every group value in the
/// context's parameter provider under the group's id.
/// </summary>
/// <param name="context">Supplies the browser service, binding resolution and the parameter provider.</param>
/// <returns>
/// A result whose <c>Succeed</c> tells whether the URI matched. On success,
/// <c>Constant.RVCount</c> holds the number of group ids that were written.
/// </returns>
public override PhaseResult Run(Context context)
{
    GeckoWebBrowser browser = (GeckoWebBrowser)context.GetService(typeof(GeckoWebBrowser));
    Debug.Assert(browser != null, "browser is null");

    String regex = context.Resolve(RegularExpression);
    // The assertion must hold when a pattern IS present. Without the negation
    // it fired for every valid pattern and stayed silent for a missing one.
    Debug.Assert(!String.IsNullOrWhiteSpace(regex), "regular expression is null!");

    String url = browser.Document.Uri;
    Match match = Regex.Match(url, regex);

    PhaseResult pr = new PhaseResult(this);
    if (match.Success)
    {
        // Ids of all groups that appear in the expression.
        List<String> ids = RegexHelper.ParseGroupIndexNames(regex);

        // Write each group's captured value to the ParameterProvider under its id.
        foreach (String id in ids)
        {
            context.ParameterProvider.SetString(id, match.Groups[id].Value);
        }

        pr.Succeed = true;
        pr.SetInt(Constant.RVCount, ids.Count);
    }
    else
    {
        pr.Succeed = false;
    }

    return pr;
}
/// <summary>
/// Generates one batch of JSON entries: for every value from <c>From</c> to
/// <c>To</c> (inclusive, advancing by <c>Step</c>) each configured pattern is
/// resolved and its updatable placeholder replaced by the current value.
/// The runtime providers on <paramref name="context"/> are expected to be set up already.
/// </summary>
/// <param name="context">Used for binding resolution, the optional first-page request and result storage.</param>
/// <returns>
/// The serialized entries, or <c>null</c> as soon as a resolved pattern still
/// contains a runtime data binding (nothing is saved in that case).
/// </returns>
/// <exception cref="ArgumentException">
/// <c>From</c> or <c>Step</c> is not an integer, <c>To</c> is a regular expression
/// but <c>FirstPage</c> is missing, or <c>Step</c> is not positive although the
/// range is non-empty.
/// </exception>
private List<String> GenerateOneBatch(Context context)
{
    Int32 from;
    Int32 to;
    Int32 step;
    List<String> result = new List<string>();

    if (!Int32.TryParse(From, out from))
    {
        throw new ArgumentException("From is corrupted.");
    }

    if (!Int32.TryParse(Step, out step))
    {
        throw new ArgumentException("Step is corrupted.");
    }

    // When To is not a number it is treated as a regular expression: request the
    // first page and read the upper bound from capture group 1.
    if (!Int32.TryParse(To, out to))
    {
        if (FirstPage == null)
        {
            String msg = "When To is a regular expression, FirstPage is expected.";
            throw new ArgumentException(msg);
        }

        PhaseResult html = FirstPage.Run(context);
        context.PushResult(html);

        Match match = Regex.Match(context.LastRequestContent, To);
        if (!match.Success || !Int32.TryParse(match.Groups[1].Value, out to))
        {
            // No usable upper bound: fall back to a single iteration.
            to = from;
        }
    }

    // A zero or negative step can never reach the upper bound of a non-empty
    // range; the loop below would keep appending entries until memory runs out.
    if (step <= 0 && from <= to)
    {
        throw new ArgumentException("Step must be positive when From does not exceed To.");
    }

    for (int i = from; i <= to; i += step)
    {
        Dictionary<String, String> dict = new Dictionary<string, string>();
        foreach (Pattern pattern in Patterns)
        {
            String resolvedPattern = context.Resolve(pattern.RawPattern);
            if (ParameterResolver.HasDataBinding(resolvedPattern, Constant.RuntimePrefix))
            {
                // A runtime binding is still unresolved: abandon the whole batch.
                return null;
            }

            String gen = resolvedPattern.Replace(Constant.UpdatablePlaceHolder, i.ToString());
            dict.Add(pattern.Name, gen);
        }

        result.Add(JsonConvert.SerializeObject(dict));
    }

    if (Save)
    {
        context.JsonResult.AddRange(result);
    }

    return result;
}
/// <summary>
/// Downloads the file referenced by <c>UrlTagName</c> in every bound JSON entry
/// into <c>Directory</c>, trying each URL at most twice.
/// </summary>
/// <param name="context">
/// Provides the bound list: the entry stored under <c>Binding</c> in
/// <c>JsonResults</c> (an empty list when the key is absent), or, when no
/// binding is configured, the list result of the last item on the stack.
/// </param>
/// <returns>
/// A result with an empty <c>ListResult</c>. <c>Succeed</c> is false only when no
/// bound list was found; otherwise "download" holds the number of successful
/// downloads and "total" the number of bound entries.
/// </returns>
public PhaseResult Run(Context context)
{
    List<String> source = null;
    if (String.IsNullOrWhiteSpace(Binding))
    {
        // No explicit binding: fall back to whatever the previous phase produced.
        PhaseResult previous = context.Stack.LastOrDefault();
        if (previous != null)
        {
            source = previous.ListResult;
        }
    }
    else
    {
        source = context.JsonResults.ContainsKey(Binding)
            ? context.JsonResults[Binding]
            : new List<string>();
    }

    PhaseResult pr = new PhaseResult(this)
    {
        ListResult = new List<string>(),
        Succeed = source != null
    };

    if (source == null)
    {
        return pr;
    }

    Int32 downloaded = 0;
    foreach (String entry in source)
    {
        Dictionary<String, String> fields = JsonConvert.DeserializeObject<Dictionary<String, String>>(entry);

        String url;
        Boolean ok = false;
        if (fields.TryGetValue(UrlTagName, out url))
        {
            // The second call only runs when the first attempt reports failure.
            ok = DownloadHelper.DownloadFile(url, Directory) || DownloadHelper.DownloadFile(url, Directory);
        }

        if (ok)
        {
            downloaded++;
        }
    }

    pr.SetInt("download", downloaded);
    pr.SetInt("total", source.Count);
    return pr;
}
/// <summary>
/// Resolves <c>Username</c> and <c>Password</c> through the context and installs
/// them on the context as a new <c>Account</c>.
/// </summary>
/// <param name="context">Resolves the bindings and receives the account.</param>
/// <returns>A result with <c>Succeed</c> set to true.</returns>
public PhaseResult Run(Context context)
{
    context.Account = new Account
    {
        UserName = context.Resolve(Username),
        Password = context.Resolve(Password)
    };

    return new PhaseResult(this) { Succeed = true };
}
/// <summary>
/// Clicks the element identified by <c>Locator</c> in the browser and records
/// the browser content on the context afterwards.
/// </summary>
/// <param name="context">Supplies the browser service and receives <c>LastRequestContent</c>.</param>
/// <returns>A result whose <c>Succeed</c> is the outcome reported by the click helper.</returns>
public override PhaseResult Run(Context context)
{
    GeckoWebBrowser gecko = (GeckoWebBrowser)context.GetService(typeof(GeckoWebBrowser));
    Debug.Assert(gecko != null, "browser is null");

    // Note: the resolved text is written back into the Locator member itself.
    String resolvedLocator = context.Resolve(Locator.Locator);
    Locator.Locator = resolvedLocator;

    Boolean clicked = RequestHelper.OperateBrowserClick(gecko, Locator);
    PhaseResult outcome = new PhaseResult(this) { Succeed = clicked };

    // Missing content is stored as an empty string rather than null.
    String pageContent = RequestHelper.GetGeckoContent(gecko);
    context.LastRequestContent = pageContent ?? String.Empty;

    return outcome;
}
/// <summary>
/// Parses the content of the last request, first with the regular-expression
/// parser and then with the XPath parser, and publishes the combined entries.
/// </summary>
/// <param name="context">
/// Supplies <c>LastRequestContent</c> and receives the entries: appended to
/// <c>JsonResult</c> when <c>Save</c> is set, stored in <c>JsonResults</c> under
/// <c>ListID</c> when one is configured.
/// </param>
/// <returns><c>Succeed</c> is always true; <c>ListResult</c> holds the extracted JSON entries.</returns>
public override PhaseResult Run(Context context)
{
    Initialize(context);

    // Treat missing content as empty so the parsers never receive null
    // (previously an empty-bodied guard left null content unhandled).
    String content = context.LastRequestContent ?? String.Empty;

    List<String> jsonResult = new List<String>();

    // Extract with the regular expression.
    jsonResult.AddRange(ParseByRegex(content));

    // Extract with XPath.
    jsonResult.AddRange(ParseByXPath(content));

    PhaseResult pr = new PhaseResult(this);
    pr.Succeed = true;
    pr.ListResult = jsonResult;

    if (Save)
    {
        context.JsonResult.AddRange(jsonResult);
    }

    if (!String.IsNullOrWhiteSpace(ListID))
    {
        // The indexer adds the key when absent and overwrites it otherwise.
        context.JsonResults[ListID] = pr.ListResult;
    }

    return pr;
}
/// <summary>
/// Waits either for a fixed number of milliseconds or until the browser's URL
/// and content match the configured patterns, then records the browser content.
/// </summary>
/// <param name="context">Supplies the browser service and binding resolution; receives <c>LastRequestContent</c>.</param>
/// <returns>
/// A result with <c>Succeed</c> set to true and the browser content stored under
/// <c>Constant.RVHttpRequestResult</c>.
/// </returns>
public override PhaseResult Run(Context context)
{
    PhaseResult pr = new PhaseResult(this);
    GeckoWebBrowser browser = (GeckoWebBrowser)context.GetService(typeof(GeckoWebBrowser));
    Debug.Assert(browser != null, "browser is null");

    if (WaitMilliseconds > 0)
    {
        // Wait for the configured number of milliseconds.
        Thread.Sleep(WaitMilliseconds);
    }
    else
    {
        String urlRegex = context.Resolve(UrlRegex);
        String contentRegex = context.Resolve(ContentRegex);

        // TODO: add a timeout; the result should only count as failed on timeout.
        // Until then both loops below poll without an upper bound.

        // First wait for the URL pattern.
        if (!String.IsNullOrWhiteSpace(urlRegex))
        {
            while (browser.Document.Uri == null || !Regex.IsMatch(browser.Document.Uri, urlRegex))
            {
                Thread.Sleep(200);
            }
        }

        // Then wait for the content pattern.
        if (!String.IsNullOrWhiteSpace(contentRegex))
        {
            // The content helper's result is null-coalesced further down, so null
            // is an expected value; Regex.IsMatch would throw on it, hence the
            // empty-string fallback while the page is not available yet.
            while (!Regex.IsMatch(RequestHelper.GetGeckoContent(browser) ?? String.Empty, contentRegex))
            {
                Thread.Sleep(200);
            }
        }
    }

    string content = RequestHelper.GetGeckoContent(browser);
    context.LastRequestContent = content ?? String.Empty;
    pr.SetString(Constant.RVHttpRequestResult, content);
    pr.Succeed = true;
    return pr;
}
/// <summary>
/// Walks the page range from <c>_from</c> to <c>_to</c> in increments of
/// <c>_step</c>: for each page the request is updated and run, then the parse
/// phase is run and its entries are collected.
/// </summary>
/// <param name="context">Receives every intermediate result and the final one via <c>PushResult</c>.</param>
/// <returns><c>Succeed</c> is always true; <c>ListResult</c> holds the entries of all pages.</returns>
public PhaseResult RunAsIteration(Context context)
{
    Initialize(context);

    // Ascending ranges stop once the page exceeds _to, descending ones once it drops below.
    Boundary withinRange;
    if (_step > 0)
    {
        withinRange = new Boundary(LessOrEqual);
    }
    else
    {
        withinRange = new Boundary(GreaterOrEqual);
    }

    List<String> collected = new List<string>();
    int page = _from;
    while (withinRange(page, _to))
    {
        _updatableRequest.Update(context, page);
        PhaseResult requestResult = Request.Run(context);
        context.PushResult(requestResult);

        // While the last page is still unknown, try to read it from capture
        // group 1 of the To expression applied to the content just fetched.
        if (!_toPageInitialized)
        {
            Int32 lastPage;
            String captured = Regex.Match(context.LastRequestContent, To).Groups[1].Value;
            if (Int32.TryParse(captured, out lastPage))
            {
                _to = lastPage;
                _toPageInitialized = true;
            }
        }

        PhaseResult parseResult = Parse.Run(context);
        context.PushResult(parseResult);
        if (parseResult.ListResult != null)
        {
            collected.AddRange(parseResult.ListResult);
        }

        page += _step;
    }

    PhaseResult summary = new PhaseResult(this) { Succeed = true, ListResult = collected };
    context.PushResult(summary);
    return summary;
}
/// <summary>
/// Resolves the data bindings in <paramref name="raw"/>. The parameter provider
/// is applied first, then every runtime provider in enumeration order, then the
/// parameter provider once more.
/// </summary>
/// <param name="raw">
/// The text to resolve. When it equals <c>Constant.ParameterPrefix</c> the
/// account's user name is returned instead; when it equals
/// <c>Constant.DOMElementPrefix</c> the account's password is returned.
/// </param>
/// <param name="context">Supplies the account, the parameter provider and the runtime providers.</param>
/// <returns>The resolved text.</returns>
public static String Resolve(String raw, Context context)
{
    // Two exact values short-circuit to the account credentials.
    if (raw == Constant.ParameterPrefix)
    {
        return context.Account.UserName;
    }

    if (raw == Constant.DOMElementPrefix)
    {
        return context.Account.Password;
    }

    String resolved = ResolveString(raw, Constant.ParameterPrefix, context.ParameterProvider);

    foreach (IDataProvider runtimeProvider in context.RuntimeProviders)
    {
        resolved = ResolveString(resolved, Constant.RuntimePrefix, runtimeProvider);
    }

    // Second parameter pass, applied to the output of the runtime pass.
    return ResolveString(resolved, Constant.ParameterPrefix, context.ParameterProvider);
}
/// <summary>
/// Runs the phases in <c>_logoutPhases</c> in order, pushing each result onto
/// the context, and stops at the first phase that does not succeed.
/// </summary>
/// <param name="context">Passed to every phase and receives each phase result.</param>
/// <returns>A result whose <c>Succeed</c> is true only if every executed phase succeeded.</returns>
public PhaseResult Run(Context context)
{
    Boolean allSucceeded = true;

    foreach (IPhase step in _logoutPhases)
    {
        PhaseResult stepResult = step.Run(context);
        context.PushResult(stepResult);

        allSucceeded = stepResult.Succeed;
        if (!allSucceeded)
        {
            break;
        }
    }

    return new PhaseResult(this) { Succeed = allSucceeded };
}
/// <summary>
/// Runs <c>MetaList</c>, then for every entry that carries a URL under
/// <c>MetaListUrlKey</c> fetches that URL and runs the parse phase on the content.
/// </summary>
/// <param name="context">
/// Supplies the optional browser service; receives <c>LastRequestContent</c> and
/// every parse result via <c>PushResult</c>.
/// </param>
/// <returns><c>Succeed</c> is always true; <c>ListResult</c> holds the entries parsed from all fetched pages.</returns>
public PhaseResult RunAsNestedList(Context context)
{
    PhaseResult list = MetaList.Run(context);
    PhaseResult pr = new PhaseResult(this);
    List<String> jsonResult = new List<string>();

    if (list.ListResult != null)
    {
        GeckoWebBrowser browser = (GeckoWebBrowser)context.GetService(typeof(GeckoWebBrowser));

        foreach (String item in list.ListResult)
        {
            Dictionary<String, String> itemDict = JsonConvert.DeserializeObject<Dictionary<String, String>>(item);

            // Skip entries that did not deserialize to an object, lack the URL
            // key, or carry a null URL; none of them can be fetched.
            String url;
            if (itemDict == null || !itemDict.TryGetValue(MetaListUrlKey, out url) || url == null)
            {
                continue;
            }

            // Ordinal comparison: this is a scheme check, not linguistic text.
            if (!url.StartsWith("http", StringComparison.Ordinal))
            {
                // Relative URLs are resolved against this fixed site root.
                url = String.Format("{0}/{1}", "http://club.autohome.com.cn", url.TrimStart('/'));
            }

            // Prefer the browser when the service is available, otherwise use a plain request.
            String content = browser != null
                ? RequestHelper.BrowserGet(browser, url)
                : RequestHelper.Get(url);

            context.LastRequestContent = content;
            PhaseResult parseResult = Parse.Run(context);
            context.PushResult(parseResult);
            jsonResult.AddRange(parseResult.ListResult ?? new List<String>());
        }
    }

    pr.ListResult = jsonResult;
    pr.Succeed = true;
    return pr;
}
/// <summary>
/// Executes this phase. Each concrete phase supplies its own implementation.
/// </summary>
/// <param name="context">The execution context handed to the phase.</param>
/// <returns>The <see cref="PhaseResult"/> produced by the implementation.</returns>
public abstract PhaseResult Run(Context context);
/// <summary>
/// Dispatches to the nested-list strategy when <c>MetaList</c> is configured and
/// to the page-iteration strategy otherwise.
/// </summary>
/// <param name="context">Forwarded unchanged to the selected strategy.</param>
/// <returns><c>Succeed</c> indicates success; <c>ListResult</c> holds the scraped JSON data.</returns>
public override PhaseResult Run(Context context)
{
    if (MetaList != null)
    {
        return RunAsNestedList(context);
    }

    return RunAsIteration(context);
}
/// <summary>
/// Not implemented; every call throws.
/// </summary>
/// <param name="context">Unused.</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public PhaseResult Run(Context context)
{
    throw new NotImplementedException();
}
/// <summary>
/// For every bound JSON entry that has a value under <c>FileUrlTagName</c>,
/// replaces that value with the text extracted from the referenced document
/// (PDF, Word or Excel, selected by <c>Type</c>) and emits the rewritten entry.
/// </summary>
/// <param name="context">
/// Provides the bound list: the entry stored under <c>Binding</c> in
/// <c>JsonResults</c> (an empty list when the key is absent), or, when no
/// binding is configured, the list result of the last item on the stack.
/// Receives the output in <c>JsonResult</c> when <c>Save</c> is set and in
/// <c>JsonResults</c> under <c>ListID</c> when one is configured.
/// </param>
/// <returns>
/// A result whose <c>ListResult</c> holds the rewritten entries. <c>Succeed</c>
/// is false only when no bound list was found.
/// </returns>
public PhaseResult Run(Context context)
{
    List<String> bind = null;
    if (!String.IsNullOrWhiteSpace(Binding))
    {
        if (context.JsonResults.ContainsKey(Binding))
        {
            bind = context.JsonResults[Binding];
        }
        else
        {
            bind = new List<string>();
        }
    }
    else
    {
        PhaseResult last = context.Stack.LastOrDefault();
        if (last != null)
        {
            bind = last.ListResult;
        }
    }

    PhaseResult pr = new PhaseResult(this);
    pr.ListResult = new List<string>();
    pr.Succeed = true;

    if (bind == null)
    {
        pr.Succeed = false;
        return pr;
    }

    foreach (String json in bind)
    {
        Dictionary<String, String> dict = JsonConvert.DeserializeObject<Dictionary<String, String>>(json);

        // Entries that are not JSON objects or lack the URL field are skipped.
        String url;
        if (dict == null || !dict.TryGetValue(FileUrlTagName, out url))
        {
            continue;
        }

        // Extract the document text; a blank result is retried exactly once.
        String content = null;
        for (int attempt = 0; attempt < 2 && String.IsNullOrWhiteSpace(content); attempt++)
        {
            switch (Type)
            {
                case DocumentType.PDF:
                    content = PdfParser.Extract(url, Directory);
                    break;
                case DocumentType.WORD:
                    content = WordParser.Extract(url, Directory);
                    break;
                case DocumentType.EXCEL:
                    content = ExcelParser.Extract(url, Directory);
                    break;
            }
        }

        // content stays null when Type matches no case or both attempts
        // returned null; fall back to empty text instead of dereferencing it.
        dict[FileUrlTagName] = (content ?? String.Empty).Replace(Environment.NewLine, String.Empty);
        pr.ListResult.Add(JsonConvert.SerializeObject(dict));
    }

    // Persist the output when requested.
    if (Save)
    {
        context.JsonResult.AddRange(pr.ListResult);
    }

    // Publish under ListID so that later phases can bind to this list.
    if (!String.IsNullOrWhiteSpace(ListID))
    {
        context.JsonResults[ListID] = pr.ListResult;
    }

    return pr;
}
/// <summary>
/// Runs the request and parse phases once per bound JSON entry (each entry is
/// exposed as a runtime data provider for the duration of its run), or exactly
/// once when there is no bound list.
/// </summary>
/// <param name="context">
/// Provides the bound list: the entry stored under <c>Binding</c> in
/// <c>JsonResults</c> (an empty list when the key is absent), or, when no
/// binding is configured, the list result of the last item on the stack.
/// Receives every intermediate result via <c>PushResult</c> and the combined
/// list in <c>JsonResults</c> under <c>ListID</c> when one is configured.
/// </param>
/// <returns>A result whose <c>ListResult</c> holds all parsed entries.</returns>
public PhaseResult Run(Context context)
{
    List<String> bind = null;
    if (!String.IsNullOrWhiteSpace(Binding))
    {
        if (context.JsonResults.ContainsKey(Binding))
        {
            bind = context.JsonResults[Binding];
        }
        else
        {
            bind = new List<string>();
        }
    }
    else
    {
        PhaseResult last = context.Stack.LastOrDefault();
        if (last != null)
        {
            bind = last.ListResult;
        }
    }

    // NOTE(review): Succeed is never assigned on this result - confirm that the
    // PhaseResult default is what callers expect.
    PhaseResult pr = new PhaseResult(this);
    pr.ListResult = new List<string>();

    if (bind != null)
    {
        foreach (String json in bind)
        {
            // Create and push BEFORE entering the try block: if creation throws,
            // the finally block must not pop an entry this iteration never pushed.
            IDataProvider provider = BasicDataProvider.CreateFromJson(json);
            context.RuntimeProviders.Push(provider);
            try
            {
                RequestAndCollect(context, pr);
            }
            finally
            {
                context.RuntimeProviders.Pop();
            }
        }
    }
    else
    {
        RequestAndCollect(context, pr);
    }

    if (!String.IsNullOrWhiteSpace(ListID))
    {
        // The indexer adds the key when absent and overwrites it otherwise.
        context.JsonResults[ListID] = pr.ListResult;
    }

    return pr;
}

/// <summary>
/// Runs the request phase followed by the parse phase, pushes both results onto
/// the context and appends the parsed entries to <paramref name="pr"/>.
/// </summary>
private void RequestAndCollect(Context context, PhaseResult pr)
{
    PhaseResult result = Request.Run(context);
    context.PushResult(result);

    result = Parse.Run(context);
    context.PushResult(result);

    if (result.ListResult != null)
    {
        pr.ListResult.AddRange(result.ListResult);
    }
}
/// <summary>
/// Resolves the data bindings in this phase's configured expressions and caches
/// the results in the private fields used while parsing.
/// </summary>
/// <param name="context">Performs the binding resolution.</param>
private void Initialize(Context context)
{
    _regex = context.Resolve(RegularExpression);
    _nested = context.Resolve(NestedRegularExpression);
    _baseXPath = context.Resolve(BaseXPath);

    // The XPath cache is rebuilt only when at least one XPath is configured.
    Boolean hasXPaths = XPaths != null && XPaths.Count > 0;
    if (hasXPaths)
    {
        _xpaths = new Dictionary<string, string>();
        foreach (var entry in XPaths)
        {
            String resolvedXPath = context.Resolve(entry.Value);
            _xpaths.Add(entry.Key, resolvedXPath);
        }
    }

    // Same for image locators. The resolved text is written back into the
    // configured locator object, which is then shared with the cache.
    Boolean hasImages = Images != null && Images.Count > 0;
    if (hasImages)
    {
        _images = new Dictionary<String, DomElementLocator>();
        foreach (var entry in Images)
        {
            DomElementLocator imageLocator = entry.Value;
            imageLocator.Locator = context.Resolve(imageLocator.Locator);
            _images.Add(entry.Key, imageLocator);
        }
    }
}
/// <summary>
/// Generates one batch per bound JSON entry (each entry is exposed as a runtime
/// data provider while its batch is generated), or a single batch when there is
/// no bound list.
/// </summary>
/// <param name="context">
/// Provides the bound list: the entry stored under <c>Binding</c> in
/// <c>JsonResults</c> (an empty list when the key is absent), or, when no
/// binding is configured, the list result of the last item on the stack.
/// Receives the combined list in <c>JsonResults</c> under <c>ListID</c> when one
/// is configured.
/// </param>
/// <returns>A result whose <c>ListResult</c> holds the entries of all generated batches.</returns>
public override PhaseResult Run(Context context)
{
    List<String> bind = null;
    if (!String.IsNullOrWhiteSpace(Binding))
    {
        if (context.JsonResults.ContainsKey(Binding))
        {
            bind = context.JsonResults[Binding];
        }
        else
        {
            bind = new List<string>();
        }
    }
    else
    {
        PhaseResult last = context.Stack.LastOrDefault();
        if (last != null)
        {
            bind = last.ListResult;
        }
    }

    PhaseResult pr = new PhaseResult(this);
    pr.ListResult = new List<string>();

    if (bind != null)
    {
        foreach (String json in bind)
        {
            // Create and push BEFORE entering the try block: if parsing throws,
            // the finally block must not pop an entry this iteration never pushed.
            IDataProvider provider = ParameterResolver.ParseProvider(json);
            context.RuntimeProviders.Push(provider);
            try
            {
                List<String> batch = GenerateOneBatch(context);
                // GenerateOneBatch returns null when a runtime binding is left
                // unresolved; AddRange would throw on null, so skip such batches.
                if (batch != null)
                {
                    pr.ListResult.AddRange(batch);
                }
            }
            finally
            {
                context.RuntimeProviders.Pop();
            }
        }
    }
    else
    {
        List<String> batch = GenerateOneBatch(context);
        if (batch != null)
        {
            pr.ListResult.AddRange(batch);
        }
    }

    if (!String.IsNullOrWhiteSpace(ListID))
    {
        // The indexer adds the key when absent and overwrites it otherwise.
        context.JsonResults[ListID] = pr.ListResult;
    }

    return pr;
}
/// <summary>
/// Prepares the upper page bound. A numeric <c>To</c> is used directly;
/// otherwise <c>To</c> is treated as a regular expression that has to be
/// evaluated against the first response.
/// </summary>
/// <param name="context">Not read by this method.</param>
private void Initialize(Context context)
{
    Int32 to;
    if (Int32.TryParse(To, out to))
    {
        _to = to;
        _toPageInitialized = true;
    }
    else
    {
        // To is a regular expression and must be parsed from the first response.
        // If that parsing fails, the iteration runs once and then stops on its
        // own (as long as _step != 0), because the bound starts at _from.
        _to = _from;

        // Reset the flag explicitly: a value left over from an earlier run on
        // this instance would otherwise prevent the bound from being re-derived.
        _toPageInitialized = false;
    }
}