public DataHandlerBase(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
{
	_cart = cart;
	_rules = rules;
	_progress = progress;
	_group = group;
	_exportDate = DateTime.Now;
	_exportDateInitialized = false;
	_migrationSlave = _rules.MigrationRules != null && _rules.MigrationRules.Enabled && !_rules.MigrationRules.IsMigrationMaster;
	Reset();
}
public SalesHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress) : base(rules, cart, progress, DataGroup.Sales) { }
private Dictionary<string, ParentItem> _parentProducts; //key is product id

#endregion Fields

#region Constructors

public CatalogHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
	: base(rules, cart, progress, DataGroup.Catalog) { }
/// <summary>
/// Fetch images from the site Categories page based on the DisplayType
/// </summary>
/// <param name="rules">site-specific scrape rules (selectors, store URL, display-type overrides)</param>
/// <param name="catUrls">relative category URLs to crawl when the categories use a tree structure</param>
/// <param name="imageCatalog">map of product id to image URL, filled in by this method</param>
/// <param name="itemCount">total catalog item count, used to estimate the number of pages</param>
/// <param name="progress">progress reporter for the current extraction run</param>
internal static void LoadProductImagesForCategory(SiteRules rules, List<string> catUrls, ref Dictionary<string, string> imageCatalog, int itemCount, ExtractorProgress progress)
{
	var nodeSelector = rules.ProductNodeSelector; //Xpath to find product nodes
	if (string.IsNullOrEmpty(nodeSelector))
		nodeSelector = "//div[@class='ProductImage QuickView']//img[contains(@src,'products')]";

	//determine whether the categories have a tree or flat structure
	var displayType = rules.ForceCategoryTreeScrape ? CategoryDisplayType.Tree : CategoryDisplayType.Flat;
	var maxPages = 1;
	var uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page=1", rules.StoreShortUrl));
	var thisWeb = new HtmlWeb();
	var thisDoc = thisWeb.Load(uri.AbsoluteUri);
	var searchSpring = false;
	//while (!rules.ForceCategoryTreeScrape) //NOTE: the loop could be skipped in this case, but it is kept for testing
	while (true) //single-pass loop to set display type
	{
		//check for use of searchspring --Ajax response is not seen by HtmlAgilityPack
		var matching = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-results_container']");
		//var matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
		if (matching != null)
		{
			searchSpring = true;
			matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
			if (matching == null) break;
			if (!matching[0].Attributes["class"].Value.Contains("searchspring-no_results"))
				//var noResults = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-no_results']");
				//if (noResults == null)
				break;
			displayType = CategoryDisplayType.Tree;
			break;
		}
		matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
		if (matching != null) break;
		var testSelector = "//div[starts-with(@class,'ProductImage')]"; //"//div[@class='ProductImage']//img[@src]";
		matching = thisDoc.DocumentNode.SelectNodes(testSelector);
		if (matching != null)
		{
			nodeSelector = testSelector;
			break;
		}
		displayType = CategoryDisplayType.Tree;
		break;
	}

	var pageStatusFormat = progress.ExtraStatus + " --page {0} of {1} (max)";
	if (displayType == CategoryDisplayType.Flat)
	{
		// we have a catalog setup with a cover page, so we need to resolve the Catalog first.
		//get page count
		if (searchSpring)
		{
			var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
			if (totalPages != null)
				maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
		}
		else
		{
			//estimate max pages
			//var nodes = thisDoc.DocumentNode.SelectNodes("//div[@id='CategoryContent']//ul/li");
			var nodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
			var imagesPerPage = nodes == null ? 1 : nodes.Count;
			maxPages = (int)Math.Ceiling((decimal)itemCount / (decimal)imagesPerPage);
		}

		//do // loop through pages incrementally until we don't have anything to match
		for (var page = 1; page <= maxPages; page++)
		{
			try
			{
				//we already got the first page above, so process it and then grab the next
				var pageStatus = string.Format(pageStatusFormat, page, maxPages);
				progress.UpdateTask(imageCatalog.Count, -1, null, pageStatus);
				var imageNodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
				if (imageNodes == null) break; // not all products usually have images, so this could end before we run out of pages
				var imageUrl = "";
				var pid = "";
				foreach (var node in imageNodes)
				{
					imageUrl = GetUrlFromMatch(node, out pid, ref rules);
					if (string.IsNullOrEmpty(pid)) continue;
					if (!imageCatalog.ContainsKey(pid))
					{
						imageCatalog.Add(pid, imageUrl);
						progress.UpdateTask(imageCatalog.Count);
					}
				}
				//grab the next page of the flat category listing
				uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page={1}", rules.StoreShortUrl, page + 1));
				thisDoc = thisWeb.Load(uri.AbsoluteUri);
			}
			catch (Exception ex)
			{
				Debug.WriteLine(ex.Message);
			}
		}
	}
	else //Tree structure
	{
		try
		{
			var catCount = 0;
			var totalCats = catUrls.Count;
			foreach (var catUrl in catUrls)
			{
				var details = string.Format("Pulling images for {0} ({1} of {2} categories)", catUrl.Replace("/", ""), ++catCount, totalCats);
				progress.UpdateTask(imageCatalog.Count, -1, null, details);

				//get the first page of results
				var thisCatUrl = String.Format("http://{0}{1}?sort=priceasc", rules.StoreShortUrl, catUrl);
				var catUri = new Uri(thisCatUrl);
				try
				{
					thisDoc = thisWeb.Load(catUri.AbsoluteUri);
				}
				catch (Exception ex)
				{
					if (BoostLog.Instance != null)
						BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to Load " + catUri.AbsoluteUri, ex);
					continue;
				}

				//get page count
				maxPages = 0;
				if (searchSpring)
				{
					var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
					if (totalPages != null)
						maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
				}
				if (maxPages < 1)
				{
					//TODO: See if there is a better way to calculate pages.
					maxPages = itemCount / 10;
				}

				var page = 1;
				do
				{
					var matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
					//var matching = thisDoc.DocumentNode.SelectNodes("//div[starts-with(@class,'ProductImage')]//img[@src]");
					//if (matching == null)
					//	matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage[QuickView]?')]//img[@src]");
					//if (matching == null)
					//	matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage']//img[@src]");
					//if (matching == null)
					//	matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage QuickView']//img[@src]");
					if (!string.IsNullOrEmpty(rules.CommentParseKey)) //sometimes the nodes we want are hidden in a comment
					{
						var commentNodes = thisDoc.DocumentNode.SelectNodes(string.Format("//comment()[contains(., {0})]", rules.CommentParseKey));
						if (commentNodes != null)
						{
							foreach (var c in commentNodes)
							{
								try
								{
									var comment = new HtmlDocument();
									comment.LoadHtml(c.InnerHtml.Replace("<!--", "").Replace("-->", ""));
									var partialMatch = comment.DocumentNode.SelectNodes(nodeSelector);
									if (partialMatch != null)
										if (matching == null) matching = partialMatch;
										else foreach (var match in partialMatch) matching.Add(match);
								}
								catch (Exception ex)
								{
									if (BoostLog.Instance != null)
										BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to parse comment node", ex);
								}
							}
						}
					}
					if (matching == null) break;
					var matchCount = matching.Count;
					var oldCount = imageCatalog.Count;
					var imageUrl = "";
					var pid = "";
					foreach (var node in matching)
					{
						imageUrl = GetUrlFromMatch(node, out pid, ref rules);
						if (string.IsNullOrEmpty(pid)) continue;
						if (!imageCatalog.ContainsKey(pid))
						{
							imageCatalog.Add(pid, imageUrl);
							progress.UpdateTask(imageCatalog.Count);
						}
					}
					if (imageCatalog.Count == oldCount) break; //no new images found
					var pageStatus = string.Format(pageStatusFormat, page, maxPages);
					progress.UpdateTask(imageCatalog.Count, -1, null, details + pageStatus);
					if (++page > maxPages) break;
					thisCatUrl = String.Format("http://{0}{1}?sort=priceasc&page={2}", rules.StoreShortUrl, catUrl, page);
					catUri = new Uri(thisCatUrl);
					thisDoc = thisWeb.Load(catUri.AbsoluteUri);
				} while (true);
			}
		}
		catch (Exception ex)
		{
			Debug.WriteLine(ex.Message);
		}
	}
}
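//Hedged usage sketch (not part of the extractor source): how a caller might invoke the crawler above,
//assuming the method lives on CatalogHandler as the surrounding members suggest. The category paths and
//item count are placeholders; in practice they come from the SiteRules and the catalog feed for the run.
//
//	var progress = new ExtractorProgress();
//	var catUrls = new List<string> { "/widgets", "/gadgets" };   //relative category paths (placeholders)
//	var imagesByPid = new Dictionary<string, string>();          //product id -> image URL
//	CatalogHandler.LoadProductImagesForCategory(rules, catUrls, ref imagesByPid, 250, progress);
//	//imagesByPid now holds one image URL per product id found while paging the category listings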
public AttributeHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group) : base(rules, cart, progress, group) { }
protected CartExtractor(SiteRules rules)
{
	if (rules == null)
		throw new NoNullAllowedException("CartExtractor Rules cannot be null");

	Rules = rules;
	Alias = Rules.Alias;
	DataReadPath = IO.DataPath.Instance.ClientDataPath(ref Alias, true);
	DataWritePath = IO.DataPath.Instance.ClientDataPath(Alias, true);
	Progress = new ExtractorProgress();
	IsExtractorQueued = false;
	IsExtracting = false;
#if !CART_EXTRACTOR_TEST_SITE
	BoostService = RestAccess.Instance;
	FeaturedCrossSells = new FeaturedRecommendations();
	FeaturedUpSells = new FeaturedRecommendations();
#endif
	ExclusionStats = new Dictionary<string, int>();
	Catalog = new CatalogHandler(Rules, this, Progress);
	Inventory = new InventoryHandler(Rules, this, Progress);
	Sales = new SalesHandler(Rules, this, Progress);
	Customers = new CustomerHandler(Rules, this, Progress);
	CategoryNames = new AttributeHandler(Rules, this, Progress, DataGroup.CategoryNames);
	BrandNames = new AttributeHandler(Rules, this, Progress, DataGroup.ManufacturerNames);
	DepartmentNames = new AttributeHandler(Rules, this, Progress, DataGroup.DepartmentNames);

	//TODO: Move all below to the CatalogHandler class or deprecate
	Exclusions = new List<ExclusionRecord>();
	Replacements = new List<ReplacementRecord>();
	ParentProducts = new Dictionary<string, ParentItem>();
	AltPrices = new Dictionary<string, List<string>>();
	AltPageLinks = new Dictionary<string, List<string>>();
	AltImageLinks = new Dictionary<string, List<string>>();
	AltTitles = new Dictionary<string, List<string>>();
	Departments = new Dictionary<string, string>();
	ExclusionCauses = new Dictionary<string, string>();
}
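//Hedged sketch (assumption, not part of the source): the constructor above is protected, so each cart
//platform is expected to subclass CartExtractor and pass its SiteRules through to wire up the handlers.
//"DemoCartExtractor" is a placeholder name, and any abstract members the base class declares would still
//need to be implemented.
//
//	public class DemoCartExtractor : CartExtractor
//	{
//		public DemoCartExtractor(SiteRules rules) : base(rules)
//		{
//			//platform-specific setup (API credentials, feed URLs, etc.) would go here
//		}
//	}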
public InventoryHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress) : base(rules, cart, progress, DataGroup.Inventory) { }
public void SetMigrationProgress(ExtractorProgress mp)
{
	_migrationProgress = mp; //TODO: confirm that this is a reference and not a copy
}
public void BeginMapping(ExtractorProgress progress)
{
	Progress = progress;
	if (Enabled)
		MigrationSubMap = new Dictionary<string, string>();
}
/// <summary>
/// Recursive method to read from the feed client.
/// Method will retry on timeout if ApiMaxTries is greater than 1
/// </summary>
/// <param name="feedUrl">URL of the feed to open</param>
/// <param name="progress">progress reporter used to surface retry status</param>
/// <param name="tryCount">current attempt number (starts at 1)</param>
/// <returns>an open read stream for the feed</returns>
public Stream TryOpenRead(string feedUrl, ref ExtractorProgress progress, int tryCount = 1)
{
	Stream resultStream = null;
	var details = "";
	try
	{
		InitServicePointManager();
		resultStream = OpenRead(feedUrl);
	}
	catch (TimeoutException tex)
	{
		if (resultStream != null) resultStream.Close();
		resultStream = null;
		details = tex.Message;
		if (tryCount >= _config.MaxTries)
			throw new Exception(details);
	}
	catch (WebException wex)
	{
		if (resultStream != null) resultStream.Close();
		resultStream = null;
		details = wex.Message;
		if (wex.Status != WebExceptionStatus.Timeout || tryCount >= _config.MaxTries)
			throw new Exception(details);
	}
#if DEBUG
	catch (Exception ex)
	{
		if (resultStream != null) resultStream.Close();
		resultStream = null;
		details = string.Format("Exception in TryOpenRead (tryCount = {0}): {1}", tryCount, ex.Message);
		throw; //rethrow without losing the original stack trace
	}
	finally
	{
		if (BoostLog.Instance != null)
			BoostLog.Instance.WriteEntry(EventLogEntryType.Error, details, feedUrl);
	}
#endif
	if (resultStream == null && tryCount < _config.MaxTries)
	{
		tryCount++;
		var msg = string.Format("Timeout. Retry {0} of {1}", tryCount, _config.MaxTries);
		progress.UpdateTable(-1, -1, msg);
#if DEBUG
		if (BoostLog.Instance != null)
			BoostLog.Instance.WriteEntry(EventLogEntryType.Warning, msg, feedUrl);
#endif
		Thread.Sleep(_config.RetryDelay);
		return TryOpenRead(feedUrl, ref progress, tryCount);
	}
	return resultStream;
}
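//Hedged usage sketch (assumption, not part of the source): reading a feed through the retry wrapper
//above, from inside the class that defines TryOpenRead. The feed URL is a placeholder, and the caller
//is assumed to already hold the ExtractorProgress for the current run.
//
//	var feedUrl = "http://demo.mystore.com/feeds/catalog.xml"; //placeholder
//	using (var stream = TryOpenRead(feedUrl, ref progress))    //retries up to _config.MaxTries on timeout
//	using (var reader = new StreamReader(stream))
//	{
//		var feedXml = reader.ReadToEnd();
//		//parse the feed here; when retries are exhausted TryOpenRead throws instead of returning null
//	}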
public CustomerHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress) : base(rules, cart, progress, DataGroup.Customers) { }