/// <summary>
/// Handles a MORE_CATEGORY page: queues one MORE_CATEGORY task per refinement
/// link found in the sidebar, or, when the page has no refinement links,
/// processes it as a result listing via <c>DoPages</c>.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives follow-up tasks; marked failed when the page cannot be used.</param>
private void DoMoreCategory(Task task, string html, ContentProcessResult result)
{
    HtmlNode document = GetRoot(html);

    // The page is rejected when it cannot be parsed or when it contains the
    // "td.refinementContainer" marker.
    // NOTE(review): presumably that marker identifies an unsupported layout - confirm.
    if (document == null || html.Contains("td.refinementContainer"))
    {
        TaskFail(task, result);
        return;
    }

    HtmlNodeCollection refinementLabels = document.SelectNodes(
        ".//div[@id='refinements']//ul[@data-typeid='n']/li/a/span[@class='refinementLink']");

    if (refinementLabels == null)
    {
        // No further refinements: treat this page as a product listing and
        // reuse the already parsed document.
        DoPages(task, html, result, document);
        return;
    }

    foreach (HtmlNode label in refinementLabels)
    {
        // The XPath selects the <span>; the href lives on its parent node.
        HtmlNode anchor = label.ParentNode;
        if (anchor == null)
        {
            continue;
        }

        string href = HAH.SafeGetAttributeStringValue(anchor, "href");
        if (href == null)
        {
            continue;
        }

        result.NewTasks.Add(MakeTask(href, AmazonTaskType.MORE_CATEGORY, null));
    }
}
/// <summary>
/// Dispatches downloaded content to the handler matching the task's
/// <c>TaobaoTaskType</c>.
/// </summary>
/// <param name="td">The task together with its downloaded data.</param>
/// <returns>The result object that was handed to the selected handler.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The task type is not one of the types handled here.
/// </exception>
protected override ContentProcessResult Process(TaskData td)
{
    ContentProcessResult outcome = new ContentProcessResult();

    // Bundle the input and the output so handlers receive a single argument.
    TaskProcess context = new TaskProcess
    {
        TaskData = td,
        CpResult = outcome
    };

    TaobaoTaskType kind = (TaobaoTaskType)td.Task.Type;
    if (kind == TaobaoTaskType.COMBINED_LIST)
    {
        HandleCombinedList(context);
    }
    else if (kind == TaobaoTaskType.PROVIDER_LIST)
    {
        HandleProviderList(context);
    }
    else if (kind == TaobaoTaskType.PROVIDER_RATE)
    {
        HandleProviderRate(context);
    }
    else
    {
        throw new ArgumentOutOfRangeException();
    }

    return outcome;
}
/// <summary>
/// Dispatches downloaded content to the handler matching the task's
/// <c>AmazonTaskType</c>.
/// </summary>
/// <param name="td">The task together with its downloaded bytes.</param>
/// <returns>
/// A result whose <c>Success</c> flag starts as true and is cleared when the
/// decoded page fails <c>Validate</c> or a handler reports failure.
/// </returns>
protected override ContentProcessResult Process(TaskData td)
{
    Task task = td.Task;
    byte[] payload = td.Bytes;
    AmazonTaskType taskType = (AmazonTaskType)task.Type;

    // Optimistic default; validation or a handler may flip this to false.
    ContentProcessResult result = new ContentProcessResult { Success = true };

    // Image payloads currently need no processing here (the image handler is
    // disabled), so they are reported as successful without further work.
    if (taskType == AmazonTaskType.IMAGE)
    {
        return result;
    }

    string html = GetString(payload, null);
    if (!Validate(html))
    {
        result.Success = false;
        return result;
    }

    switch (taskType)
    {
        case AmazonTaskType.INDEX:
            DoIndex(task, html, result);
            break;

        case AmazonTaskType.CATEGORY:
            // Intentionally a no-op: the category handler is disabled.
            break;

        case AmazonTaskType.MORE_CATEGORY:
            DoMoreCategory(task, html, result);
            break;

        case AmazonTaskType.PAGES:
            // Passing null makes DoPages parse the markup itself.
            DoPages(task, html, result, null);
            break;

        case AmazonTaskType.PAGE:
            DoPage(task, html, result);
            break;
    }

    return result;
}
/// <summary>
/// Handles a result-listing page: queues a PAGE task for every product link
/// and a PAGES task for the "next page" link, if present.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup; parsed only when <paramref name="root"/> is null.</param>
/// <param name="result">Receives follow-up tasks; marked failed when the page cannot be used.</param>
/// <param name="root">An already parsed document for <paramref name="html"/>, or null to parse it here.</param>
private void DoPages(Task task, string html, ContentProcessResult result, HtmlNode root)
{
    if (root == null)
    {
        root = GetRoot(html);
        if (root == null)
        {
            TaskFail(task, result);
            return;
        }
    }

    HtmlNodeCollection shops = root.SelectNodes(".//div[@id='rightResultsATF']//a[@class='title']");

    // The breadcrumb text becomes the context attached to every PAGE task;
    // without it the page is treated as a failure.
    HtmlNode contextNode = root.SelectSingleNode("id('breadCrumb')");
    if (contextNode == null)
    {
        TaskFail(task, result);
        return;
    }

    string context = HttpUtility.HtmlDecode(contextNode.InnerText);
    // NOTE(review): ReplaceSpace is defined elsewhere; presumably it strips whitespace - confirm.
    context = ReplaceSpace.Replace(context, "");
    // Neutralise existing slashes first, then turn the breadcrumb separator
    // into a backslash so only separators end up as backslashes.
    context = context.Replace('\\', ',');
    context = context.Replace('/', ',');
    context = context.Replace('›', '\\');

    if (shops != null)
    {
        foreach (HtmlNode node in shops)
        {
            string path = HAH.SafeGetAttributeStringValue(node, "href");

            // Skip anchors without an href, consistent with the other call
            // sites in this file, instead of queueing a task with a null URL.
            if (path != null)
            {
                result.NewTasks.Add(MakeTask(path, AmazonTaskType.PAGE, context));
            }
        }
    }

    HtmlNode nextPageLink = root.SelectSingleNode(".//a[@id='pagnNextLink']");
    if (nextPageLink != null)
    {
        string path = HAH.SafeGetAttributeStringValue(nextPageLink, "href");
        if (path != null)
        {
            result.NewTasks.Add(MakeTask(path, AmazonTaskType.PAGES, null));
        }
    }
}
/// <summary>
/// Handles the site-directory index page: for every directory link whose URL
/// yields a numeric node id, queues a MORE_CATEGORY task for that node's
/// search page.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives follow-up tasks; marked failed when the page cannot be parsed.</param>
private void DoIndex(Task task, string html, ContentProcessResult result)
{
    HtmlNode document = GetRoot(html);
    if (document == null)
    {
        TaskFail(task, result);
        return;
    }

    HtmlNodeCollection anchors = document.SelectNodes(".//div[@id='siteDirectory']//td//a");
    if (anchors == null)
    {
        // No directory links: nothing to queue, and not treated as a failure.
        return;
    }

    foreach (HtmlNode anchor in anchors)
    {
        string href = HAH.SafeGetAttributeStringValue(anchor, "href");
        if (href == null)
        {
            continue;
        }

        Match idMatch = GetNodeID.Match(href);
        if (!idMatch.Success)
        {
            continue;
        }

        int nodeId;
        if (!Int32.TryParse(idMatch.Groups["id"].Value, out nodeId))
        {
            continue;
        }

        // Resulting URL shape: http://www.amazon.cn/gp/search/ref=sr_hi_1?rh=n%3A658390051&ie=UTF8
        string searchUrl = "http://www.amazon.cn/gp/search/ref=sr_hi_1?rh=n%3A" + nodeId + "&ie=UTF8";
        result.NewTasks.Add(MakeTask(searchUrl, AmazonTaskType.MORE_CATEGORY, null));
    }
}
/// <summary>
/// Marks <paramref name="result"/> as failed and discards any follow-up
/// tasks already collected in it.
/// </summary>
/// <param name="task">The failing task; not used by this method.</param>
/// <param name="result">The result to flag as unsuccessful and empty.</param>
private void TaskFail(Task task, ContentProcessResult result)
{
    result.Success = false;

    // Drop follow-up tasks queued before the failure was detected.
    result.NewTasks.Clear();
}
/// <summary>
/// Handles a product page: extracts image URLs from the raw markup and queues
/// an IMAGE task for each, carrying over the current task's context.
/// </summary>
/// <remarks>
/// Four patterns are tried in order and only the first one that matches at
/// all is used. When none matches, the miss is logged and the task fails.
/// </remarks>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives the IMAGE tasks; marked failed when no pattern matches.</param>
private void DoPage(Task task, string html, ContentProcessResult result)
{
    // A null group name means "use the whole match".
    if (TryAddImageTasks(PicPathRegex, "thumb", task, html, result)
        || TryAddImageTasks(PicPathRegex2, null, task, html, result)
        || TryAddImageTasks(PicPathRegex3, "img", task, html, result)
        || TryAddImageTasks(PicPathRegex4, "pic", task, html, result))
    {
        return;
    }

    Log("{0} has not picture", task.Url);
    TaskFail(task, result);
}

/// <summary>
/// Runs one image pattern over the markup and queues an IMAGE task for every
/// non-empty captured URL.
/// </summary>
/// <param name="pattern">The pattern to run over <paramref name="html"/>.</param>
/// <param name="groupName">The capture group holding the URL, or null to use the whole match.</param>
/// <param name="task">The current task, whose context is attached to each new task.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives the IMAGE tasks.</param>
/// <returns>
/// True when the pattern matched at least once, even if every captured URL
/// was empty; false when it did not match at all.
/// </returns>
private bool TryAddImageTasks(Regex pattern, string groupName, Task task, string html, ContentProcessResult result)
{
    MatchCollection matches = pattern.Matches(html);
    if (matches.Count == 0)
    {
        return false;
    }

    foreach (Match m in matches)
    {
        string imagePath = groupName == null ? m.Value : m.Groups[groupName].Value;
        if (!string.IsNullOrEmpty(imagePath))
        {
            result.NewTasks.Add(MakeTask(imagePath, AmazonTaskType.IMAGE, task.Context));
        }
    }

    return true;
}