/// <summary>
/// Handles a category page. Every refinement link found in the refinements panel is queued as a
/// further MORE_CATEGORY task; when the page has no refinement links it is passed on to
/// <c>DoPages</c> instead.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives the newly created tasks (or the failure, via <c>TaskFail</c>).</param>
private void DoMoreCategory(Task task, string html, ContentProcessResult result)
{
    HtmlNode root = GetRoot(html);
    if (root == null)
    {
        TaskFail(task, result);
        return;
    }

    // A page whose raw text contains this marker is treated as a failed download.
    if (html.Contains("td.refinementContainer"))
    {
        TaskFail(task, result);
        return;
    }

    HtmlNodeCollection refinementSpans = root.SelectNodes(".//div[@id='refinements']//ul[@data-typeid='n']/li/a/span[@class='refinementLink']");
    if (refinementSpans == null)
    {
        // No more refinements: hand the already parsed page to DoPages.
        DoPages(task, html, result, root);
        return;
    }

    foreach (HtmlNode span in refinementSpans)
    {
        // The XPath selects the <span>; the href lives on its parent <a>.
        HtmlNode anchor = span.ParentNode;
        if (anchor == null)
        {
            continue;
        }

        string href = HAH.SafeGetAttributeStringValue(anchor, "href");
        if (href == null)
        {
            continue;
        }

        result.NewTasks.Add(MakeTask(href, AmazonTaskType.MORE_CATEGORY, null));
    }
}
/// <summary>
/// Processes a provider list page: each list item is passed to <c>HandleProviderListNode</c>
/// (stopping at the first item that reports failure), and the "next page" link, when present,
/// is queued as another PROVIDER_LIST task carrying the current task's context.
/// </summary>
/// <param name="tp">The task being processed; its <c>CpResult</c> receives new tasks and the success flag.</param>
private void HandleProviderList(TaskProcess tp)
{
    string html = Encoding.Default.GetString(tp.TaskData.Bytes);
    if (!Validate(html))
    {
        tp.CpResult.Success = false;
        return;
    }

    HtmlNode root = GetRoot(html);
    if (root == null)
    {
        FailProcess(tp);
        return;
    }

    HtmlNodeCollection items = root.SelectNodes(@"//div[@id='list-content']//li[@class='list-item']");
    if (items == null)
    {
        FailProcess(tp);
        return;
    }

    // Stop at the first item that cannot be handled; later items are not visited.
    bool allHandled = true;
    foreach (HtmlNode item in items)
    {
        if (!HandleProviderListNode(tp, item))
        {
            allHandled = false;
            break;
        }
    }

    // The next-page task is queued regardless of how the item loop ended.
    string nextHref = HAH.SafeGetSuccessorAttributeStringValue(root, @".//a[@class='page-next']", "href");
    if (nextHref != null)
    {
        tp.CpResult.NewTasks.Add(new Task
        {
            Url = FixRelativeURL(nextHref, tp.TaskData.Task.Host),
            Type = (int)TaobaoTaskType.PROVIDER_LIST,
            Context = tp.TaskData.Task.Context
        });
    }

    tp.CpResult.Success = allHandled;
}
/// <summary>
/// Processes a result listing page: queues a PAGE task for every result link (tagged with a
/// context string derived from the breadcrumb) and a PAGES task for the "next page" link.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup; only parsed here when <paramref name="root"/> is null.</param>
/// <param name="result">Receives the newly created tasks (or the failure, via <c>TaskFail</c>).</param>
/// <param name="root">An already parsed document root, or null to have this method parse <paramref name="html"/>.</param>
private void DoPages(Task task, string html, ContentProcessResult result, HtmlNode root)
{
    if (root == null)
    {
        root = GetRoot(html);
        if (root == null)
        {
            TaskFail(task, result);
            return;
        }
    }

    HtmlNodeCollection shops = root.SelectNodes(".//div[@id='rightResultsATF']//a[@class='title']");

    // The breadcrumb is mandatory: without it the page is reported as failed.
    HtmlNode contextNode = root.SelectSingleNode("id('breadCrumb')");
    if (contextNode == null)
    {
        TaskFail(task, result);
        return;
    }

    // Normalise the breadcrumb text into the context string: strip whatever ReplaceSpace matches,
    // turn literal '\' and '/' into ',' first, and only then use '\' as the level separator in
    // place of the breadcrumb's own separator character.
    string context = HttpUtility.HtmlDecode(contextNode.InnerText);
    context = ReplaceSpace.Replace(context, "");
    context = context.Replace('\\', ',');
    context = context.Replace('/', ',');
    context = context.Replace('›', '\\');

    if (shops != null)
    {
        foreach (HtmlNode node in shops)
        {
            string path = HAH.SafeGetAttributeStringValue(node, "href");

            // Skip anchors without an href, matching the null check applied to every other link
            // in this class, so that no task is created from a missing URL.
            if (path != null)
            {
                result.NewTasks.Add(MakeTask(path, AmazonTaskType.PAGE, context));
            }
        }
    }

    HtmlNode nextPageLink = root.SelectSingleNode(".//a[@id='pagnNextLink']");
    if (nextPageLink != null)
    {
        string path = HAH.SafeGetAttributeStringValue(nextPageLink, "href");
        if (path != null)
        {
            result.NewTasks.Add(MakeTask(path, AmazonTaskType.PAGES, null));
        }
    }
}
/// <summary>
/// Processes the site directory page: for every directory link whose href yields an integer
/// node id (via <c>GetNodeID</c>), queues a MORE_CATEGORY task for that node's search URL.
/// </summary>
/// <param name="task">The task whose downloaded content is being processed.</param>
/// <param name="html">The page markup.</param>
/// <param name="result">Receives the newly created tasks (or the failure, via <c>TaskFail</c>).</param>
private void DoIndex(Task task, string html, ContentProcessResult result)
{
    HtmlNode root = GetRoot(html);
    if (root == null)
    {
        TaskFail(task, result);
        return;
    }

    HtmlNodeCollection directoryLinks = root.SelectNodes(".//div[@id='siteDirectory']//td//a");
    if (directoryLinks == null)
    {
        // No directory links: nothing to queue; this is not reported as a failure.
        return;
    }

    foreach (HtmlNode anchor in directoryLinks)
    {
        string href = HAH.SafeGetAttributeStringValue(anchor, "href");
        if (href == null)
        {
            continue;
        }

        Match idMatch = GetNodeID.Match(href);
        if (!idMatch.Success)
        {
            continue;
        }

        int nodeId;
        if (!Int32.TryParse(idMatch.Groups["id"].Value, out nodeId))
        {
            continue;
        }

        // Resulting URL looks like: http://www.amazon.cn/gp/search/ref=sr_hi_1?rh=n%3A658390051&ie=UTF8
        string searchUrl = "http://www.amazon.cn/gp/search/ref=sr_hi_1?rh=n%3A" + nodeId + "&ie=UTF8";
        Task categoryTask = MakeTask(searchUrl, AmazonTaskType.MORE_CATEGORY, null);
        result.NewTasks.Add(categoryTask);
    }
}
/// <summary>
/// Handles a combined-list entry that is sold by several shops: when the entry's link matches
/// <c>RegexUniqID</c>, a PROVIDER_LIST task is queued for that link with the captured id as context.
/// </summary>
/// <param name="tp">The task being processed; its <c>CpResult</c> receives the new task.</param>
/// <param name="node">The list-item node to read the link from.</param>
/// <returns><c>false</c> only when the entry has no link; <c>true</c> otherwise (including when the link does not match the id pattern).</returns>
private bool HandleCombinedListMultiShops(TaskProcess tp, HtmlNode node)
{
    string href = HAH.SafeGetSuccessorAttributeStringValue(node, @".//div[@class='legend2']/a", "href");
    if (href == null)
    {
        LogMissing("URL", node.InnerHtml);
        return false;
    }

    Match idMatch = RegexUniqID.Match(href);
    if (!idMatch.Success)
    {
        // A link without a recognisable id is skipped without being counted as a failure.
        return true;
    }

    string capturedIds = idMatch.Groups["number"].Value;
    tp.CpResult.NewTasks.Add(new Task
    {
        Type = (int)TaobaoTaskType.PROVIDER_LIST,
        Url = FixRelativeURL(href, tp.TaskData.Task.Host),
        Context = capturedIds,
    });

    return true;
}
/// <summary>
/// Builds an <c>Item</c> from a combined-list entry that is offered by a single shop, stores it
/// through <c>OpsItem.Upsert</c>, and queues a PROVIDER_RATE task for the item's seller.
/// </summary>
/// <remarks>
/// All numbers are parsed with the invariant culture because the page text has a fixed format
/// (for example "359.00") that must not depend on the culture of the machine running the crawler.
/// Integer captures are parsed with <c>TryParse</c> so that malformed or oversized digit runs in
/// scraped text are reported instead of throwing.
/// </remarks>
/// <param name="tp">The task being processed; supplies the host used to fix relative URLs and receives the new task.</param>
/// <param name="node">The list-item node to read the item fields from.</param>
/// <returns>
/// <c>false</c> when a required field (freight, name, location, price or seller id) is missing or
/// cannot be parsed; <c>true</c> once the item has been stored and the seller task queued.
/// </returns>
private bool HandleCombinedListSingleShop(TaskProcess tp, HtmlNode node)
{
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    // Same number styles that double.TryParse(string, out double) applies by default.
    const System.Globalization.NumberStyles decimalStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    #region build an item
    Item i = new Item();

    // Freight text is a three-character label followed by the amount (e.g. "<label>8.00").
    string freight = HAH.SafeGetSuccessorInnerText(node, @".//li[@class='shipment']/span[@class='fee']");
    if (freight == null || freight.Length <= 3)
    {
        LogMissing("freight", freight);
        return false;
    }

    string freightD = freight.Substring(3);
    double freightValue;
    if (!double.TryParse(freightD, decimalStyle, invariant, out freightValue))
    {
        LogMissing("freight", freightD);
        return false;
    }
    i.Freight = freightValue;

    i.Name = HAH.SafeGetSuccessorAttributeStringValue(node, @".//a[@class='EventCanSelect']", "title");
    if (i.Name == null)
    {
        LogMissing("Name", node.InnerHtml);
        return false;
    }

    i.Location = HAH.SafeGetSuccessorInnerText(node, @".//li[@class='shipment']/span[@class='loc']");
    if (i.Location == null)
    {
        LogMissing("Location", node.InnerHtml);
        return false;
    }

    // Price text is a bare amount, e.g. "359.00".
    string price = HAH.SafeGetSuccessorInnerText(node, @".//li[@class='price']/em");
    double priceValue;
    if (string.IsNullOrEmpty(price) || !double.TryParse(price, decimalStyle, invariant, out priceValue))
    {
        LogMissing("price", price);
        return false;
    }
    i.Price = priceValue;

    // The recent-deal count is optional: it stays 0 when absent, unmatched or unparsable.
    i.RecentDeal = 0;
    string recentDeal = HAH.SafeGetSuccessorInnerText(node, @".//li[@class='price']/span");
    if (!string.IsNullOrEmpty(recentDeal))
    {
        Match m = RegexRecentSellCount.Match(recentDeal);
        if (m.Success)
        {
            int recentDealCount;
            if (Int32.TryParse(m.Groups["number"].Value, System.Globalization.NumberStyles.Integer, invariant, out recentDealCount))
            {
                i.RecentDeal = recentDealCount;
            }
            else
            {
                LogMissing("RecentDeal", recentDeal);
            }
        }
    }

    // The seller id is mandatory: it is taken from the seller link's href.
    string sellerID = HAH.SafeGetSuccessorAttributeStringValue(node, @".//li[@class='seller']/a", "href");
    if (string.IsNullOrEmpty(sellerID))
    {
        LogMissing("SellerID", node.InnerHtml);
        return false;
    }

    Match sellerMatch = RegexSellerID.Match(sellerID);
    int sellerTaobaoId;
    if (!sellerMatch.Success
        || !Int32.TryParse(sellerMatch.Groups["number"].Value, System.Globalization.NumberStyles.Integer, invariant, out sellerTaobaoId))
    {
        LogMissing("SellerID", sellerID);
        return false;
    }
    i.SellerTaobaoId = sellerTaobaoId;

    i.UniqId = 0;

    // NOTE(review): the href may be null here and is still passed to FixRelativeURL, as before;
    // confirm that FixRelativeURL tolerates a null URL.
    i.UrlLink = HAH.SafeGetSuccessorAttributeStringValue(node, ".//a[@class='EventCanSelect']", "href");
    i.UrlLink = FixRelativeURL(i.UrlLink, tp.TaskData.Task.Host);
    if (!string.IsNullOrEmpty(i.UrlLink))
    {
        // The item id is optional: it is only set when the URL carries a parsable id.
        Match m = RegexItemTaobaoID.Match(i.UrlLink);
        if (m.Success)
        {
            long itemTaobaoId;
            if (long.TryParse(m.Groups["number"].Value, System.Globalization.NumberStyles.Integer, invariant, out itemTaobaoId))
            {
                i.TaobaoId = itemTaobaoId;
            }
            else
            {
                LogMissing("TaobaoID", i.UrlLink);
            }
        }
    }

    OpsItem.Upsert(i);
    #endregion

    #region build new task
    // Seller: queue the seller's rating page, carrying the seller id as context.
    tp.CpResult.NewTasks.Add(new Task
    {
        Url = RateURL.Replace("#UID#", i.SellerTaobaoId.ToString()),
        Type = (int)TaobaoTaskType.PROVIDER_RATE,
        Context = i.SellerTaobaoId.ToString()
    });
    #endregion

    return true;
}
/// <summary>
/// Processes a combined list page. Each list item is dispatched by its shop count to
/// <c>HandleCombinedListSingleShop</c> (exactly one shop) or <c>HandleCombinedListMultiShops</c>
/// (any other count); processing stops at the first failing item. The "next page" link, when
/// present, is queued as another COMBINED_LIST task.
/// </summary>
/// <param name="tp">The task being processed; its <c>CpResult</c> receives new tasks and the success flag.</param>
private void HandleCombinedList(TaskProcess tp)
{
    string html = Encoding.Default.GetString(tp.TaskData.Bytes);
    if (!Validate(html))
    {
        tp.CpResult.Success = false;
        return;
    }

    HtmlNode root = GetRoot(html);
    if (root == null)
    {
        FailProcess(tp);
        return;
    }

    HtmlNodeCollection items = root.SelectNodes(@"//div[@id='list-content']//li[@class='list-item']");
    if (items == null)
    {
        FailProcess(tp);
        return;
    }

    bool allHandled = true;
    foreach (HtmlNode item in items)
    {
        // The shop-count text decides whether this entry is a single shop or several.
        string countText = HAH.SafeGetSuccessorInnerText(item, @".//div[@class='legend2']/a");
        if (string.IsNullOrEmpty(countText))
        {
            // A missing count is reported, but the item is skipped without failing the page.
            LogMissing("count", countText);
            continue;
        }

        Match countMatch = RegexShopCount.Match(countText);
        if (!countMatch.Success)
        {
            LogMissing("count", countText);
            allHandled = false;
            break;
        }

        int shopCount = Int32.Parse(countMatch.Groups["number"].Value);
        bool handled;
        if (shopCount == 1)
        {
            handled = HandleCombinedListSingleShop(tp, item);
        }
        else
        {
            handled = HandleCombinedListMultiShops(tp, item);
        }

        if (!handled)
        {
            allHandled = false;
            break;
        }
    }

    // The next-page task is queued regardless of how the item loop ended.
    string nextHref = HAH.SafeGetSuccessorAttributeStringValue(root, @".//a[@class='page-next']", "href");
    if (nextHref != null)
    {
        tp.CpResult.NewTasks.Add(new Task
        {
            Url = FixRelativeURL(nextHref, tp.TaskData.Task.Host),
            Type = (int)TaobaoTaskType.COMBINED_LIST,
        });
    }

    tp.CpResult.Success = allHandled;
}
/// <summary>
/// Processes a seller rating page and stores the result through <c>OpsSeller.Upsert</c>.
/// The seller id comes from the task context; the page supplies the Tmall flag, credit and
/// good-rate (non-Tmall sellers only), the six-month dynamic scores and the guarantee flags.
/// </summary>
/// <remarks>
/// Numbers are parsed with the invariant culture because the page text has a fixed format
/// (for example "98.30%") that must not depend on the culture of the machine running the crawler.
/// NOTE(review): unlike the list handlers, this method never assigns <c>tp.CpResult.Success</c>
/// on the success path; confirm that the flag's default is what callers expect.
/// </remarks>
/// <param name="tp">The task being processed; its context must hold the seller's numeric id.</param>
private void HandleProviderRate(TaskProcess tp)
{
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    // Same number styles that double.TryParse(string, out double) applies by default.
    const System.Globalization.NumberStyles decimalStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    string html = Encoding.Default.GetString(tp.TaskData.Bytes);
    if (!Validate(html))
    {
        tp.CpResult.Success = false;
        return;
    }

    HtmlNode root = GetRoot(html);
    if (root == null)
    {
        FailProcess(tp);
        return;
    }

    Seller s = new Seller();

    // The seller id travels in the task context. A missing or non-numeric context fails the
    // task instead of throwing out of the handler.
    string sellerContext = tp.TaskData.Task.Context;
    int taobaoId;
    if (sellerContext == null
        || !Int32.TryParse(sellerContext, System.Globalization.NumberStyles.Integer, invariant, out taobaoId))
    {
        LogMissing("TaobaoID", sellerContext);
        tp.CpResult.Success = false;
        return;
    }
    s.TaobaoId = taobaoId;

    s.IsTmall = root.SelectSingleNode(@".//div[@class='tmall-pro']") != null;
    if (!s.IsTmall)
    {
        string credit = HAH.SafeGetSuccessorInnerText(root, @".//ul[contains(@class,'sep')]/li");
        if (string.IsNullOrEmpty(credit))
        {
            LogMissing("credit", credit);
            tp.CpResult.Success = false;
            return;
        }

        credit = RegexEmpty.Replace(credit, "");

        // Credit text is a five-character label followed by the value (e.g. "<label>5").
        int crediti;
        if (credit.Length > 5
            && int.TryParse(credit.Substring(5), System.Globalization.NumberStyles.Integer, invariant, out crediti))
        {
            s.Credit = crediti;
        }
        else
        {
            // Substring(5) throws when the text is shorter than the label, so report the whole
            // text in that case.
            LogMissing("Credit", credit.Length >= 5 ? credit.Substring(5) : credit);
        }

        // Good-rate text is a four-character label, the value, then '%' (e.g. "<label>98.30%").
        string goodRate = HAH.SafeGetSuccessorInnerText(root, @".//div[@id='seller-rate']//em");
        int percentIndex = string.IsNullOrEmpty(goodRate) ? -1 : goodRate.IndexOf('%');
        if (percentIndex >= 4)
        {
            double goodrated;
            if (double.TryParse(goodRate.Substring(4, percentIndex - 4), decimalStyle, invariant, out goodrated))
            {
                s.Goodrate = goodrated;
            }
            else
            {
                LogMissing("GoodRate", goodRate);
            }
        }
        else
        {
            LogMissing("GoodRate", goodRate);
        }
    }

    // Disabled: shop creation time.
    // HtmlNodeCollection infos = root.SelectNodes(@".//div[@class='bd']/ul/li");
    // foreach (HtmlNode node in infos)
    // {
    //     Match m = RegexCreateTime.Match(node.InnerText);
    //     if (m.Success)
    //     {
    //         string date = m.Groups["time"].Value;
    //         s.StartTime = DateTime.Parse(date);
    //     }
    // }

    // Six-month dynamic scores.
    {
        HtmlNodeCollection nodes = root.SelectNodes(@".//div[@id='sixmonth']//div[@class='item-scrib']");
        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                string title = HAH.SafeGetSuccessorInnerText(node, @"./span[@class='title']");
                string count = HAH.SafeGetSuccessorInnerText(node, @"./em[@class='count']");
                HtmlNode percent = node.SelectSingleNode(@".//strong[contains(@class,'percent')]");
                if (percent == null)
                {
                    LogMissing(title, "Percent");
                    continue;
                }

                double c;
                if (!double.TryParse(count, decimalStyle, invariant, out c))
                {
                    LogMissing(title, count);
                    continue;
                }

                // "----" is the page's placeholder for "no percentage"; it is stored as 0.
                double d;
                string rate = percent.InnerText.Replace("%", "");
                if (rate == "----")
                {
                    d = 0;
                }
                else if (!double.TryParse(rate, decimalStyle, invariant, out d))
                {
                    LogMissing(title, rate);
                    continue;
                }

                // A "lower" class on the percent element makes the stored percentage negative.
                string percentClass = percent.Attributes["class"].Value;
                if (percentClass.Contains("lower"))
                {
                    d *= -1.0;
                }

                // Titles are matched against the page's exact labels: description match,
                // service attitude and shipping speed, in that order.
                if (title == "宝贝与描述相符:")
                {
                    s.Rmatch = c;
                    s.Pmatch = d;
                }
                else if (title == "卖家的服务态度:")
                {
                    s.Rservice = c;
                    s.Pservice = d;
                }
                else if (title == "卖家发货的速度:")
                {
                    s.Rspeed = c;
                    s.Pspeed = d;
                }
            }
        }
    }

    // Guarantees: each flag is set when the block's markup contains the matching label
    // (consumer protection, seven-day returns, genuine goods, invoice provided).
    string text = HAH.SafeGetSuccessorInnerHtml(root, @".//div[@class='desc' or @class='promise']");
    if (!string.IsNullOrEmpty(text))
    {
        s.Pprotect = text.Contains("消费者保障");
        s.Psevendays = text.Contains("7天无理由退换货") || text.Contains("七天退换");
        s.Preal = text.Contains("正品保障");
        s.Pinvoice = text.Contains("提供发票");
    }
    else
    {
        s.Pprotect = false;
        s.Psevendays = false;
        s.Preal = false;
        s.Pinvoice = false;
    }

    // Disabled: 30-day service statistics (average refund speed, refund rate, complaint rate,
    // penalty count).
    /*
     * {
     *     HtmlNodeCollection nodes = root.SelectNodes(@".//div[@class='each']");
     *     foreach (var node in nodes)
     *     {
     *         HtmlNodeCollection innerNodes = node.SelectNodes(@"./span");
     *         if (innerNodes.Count == 4)
     *         {
     *             string title = innerNodes[0].InnerText;
     *             if (title.Contains("平均退款速度"))
     *             {
     *                 s.Refunddays = Double.Parse(innerNodes[1].InnerText.Replace("%", ""));
     *             }
     *             else if (title.Contains("近30天退款率"))
     *             {
     *                 s.Refundrate = Double.Parse(innerNodes[1].InnerText.Replace("%", ""));
     *             }
     *             else if (title.Contains("近30天投诉率"))
     *             {
     *                 s.Complaint = Double.Parse(innerNodes[1].InnerText.Replace("%", ""));
     *             }
     *             else if (title.Contains("近30天处罚数"))
     *             {
     *                 // value looks like "0" followed by a space and a one-character counter word
     *                 s.Penalty = Int32.Parse(innerNodes[1].InnerText.Replace(" 次", ""));
     *             }
     *         }
     *     }
     * }*/

    OpsSeller.Upsert(s);
}