/// <summary>
/// Extract the image URL and the product id from a matched HtmlNode.
/// Nothing is stored here; the caller decides what to do with the returned values.
/// </summary>
/// <param name="node">Matched node; its OuterHtml (or that of the first node found by the
/// optional selectors in the rules) is scanned for the url and the product id.</param>
/// <param name="pid">Receives the product id, or null when no product id is found.</param>
/// <param name="rules">Site rules supplying the optional selectors and the prefix/suffix markers.</param>
/// <returns>The image url (starting at "//" when the url contains a protocol),
/// or an empty string when no image url is found.</returns>
internal static string GetUrlFromMatch(HtmlNode node, out string pid, ref SiteRules rules)
{
    pid = null;

    //standard link looks like this
    //http://cdn2.bigcommerce.com/server3300/46048/products/10676/images/91127/Hunter_Original_Tall_Boot_Aubergine_4__95507.1343675413.185.279.jpg
    var imageUrlSelector = rules.ImageUrlSelector;
    var imageUrlPrefix = rules.ImageUrlPrefix;
    if (string.IsNullOrEmpty(imageUrlPrefix)) imageUrlPrefix = "src=\"";
    var imageUrlSuffix = rules.ImageUrlSuffix;
    if (string.IsNullOrEmpty(imageUrlSuffix)) imageUrlSuffix = "\"";
    var pidSelector = rules.PidSelector;
    var pidPrefix = rules.PidPrefix;
    if (string.IsNullOrEmpty(pidPrefix)) pidPrefix = "/products/";
    var pidSuffix = rules.PidSuffix;
    if (string.IsNullOrEmpty(pidSuffix)) pidSuffix = "/";

    //find the image url
    HtmlNode imageNode;
    if (string.IsNullOrEmpty(imageUrlSelector))
        imageNode = node;
    else
    {
        var nodeList = node.SelectNodes(imageUrlSelector);
        imageNode = nodeList == null ? node : nodeList.FirstOrDefault();
    }
    if (imageNode == null) return "";

    // The markers are literal markup fragments, so compare them ordinally (not by culture)
    var html = imageNode.OuterHtml;
    var src = html.IndexOf(imageUrlPrefix, StringComparison.Ordinal);
    if (src < 0) return "";
    src += imageUrlPrefix.Length;

    // Find the end of the url first, then look for the protocol separator only inside the url.
    // Searching the rest of the html would let a relative url pick up a "//" that belongs
    // to some later attribute and return text from beyond the real url.
    var end = html.IndexOf(imageUrlSuffix, src, StringComparison.Ordinal);
    var start = end < 0
        ? html.IndexOf("//", src, StringComparison.Ordinal)
        : html.IndexOf("//", src, end - src, StringComparison.Ordinal); //skip protocol
    if (start < 0) start = src;
    var url = end < 0 ? html.Substring(start) : html.Substring(start, end - start);

    //find the product id
    HtmlNode pidNode;
    if (string.IsNullOrEmpty(pidSelector))
        pidNode = node;
    else
    {
        var nodeList = node.SelectNodes(pidSelector);
        pidNode = nodeList == null ? node : nodeList.FirstOrDefault();
    }
    if (pidNode == null) return url;

    html = pidNode.OuterHtml;
    start = html.IndexOf(pidPrefix, StringComparison.Ordinal);
    if (start < 0) return url;
    start += pidPrefix.Length;
    end = html.IndexOf(pidSuffix, start, StringComparison.Ordinal);
    pid = end < 0 ? html.Substring(start) : html.Substring(start, end - start);
    return url;
}
/// <summary>
/// Stores the supplied collaborators, stamps the export date with the current local time,
/// works out whether this handler is a migration slave, and finally calls Reset().
/// </summary>
/// <param name="rules">Site rules; also the source of the migration settings.</param>
/// <param name="cart">Cart extractor kept by this handler.</param>
/// <param name="progress">Progress object kept by this handler.</param>
/// <param name="group">Data group kept by this handler.</param>
public DataHandlerBase(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
{
    _rules = rules;
    _cart = cart;
    _group = group;
    _progress = progress;

    _exportDateInitialized = false;
    _exportDate = DateTime.Now;

    // Slave = migration rules exist, are enabled, and do not mark this site as the master
    var isSlave = false;
    if (_rules.MigrationRules != null && _rules.MigrationRules.Enabled)
    {
        isSlave = !_rules.MigrationRules.IsMigrationMaster;
    }
    _migrationSlave = isSlave;

    Reset();
}
/// <summary>
/// Builds an XML feed extractor: determines the feed types and, when the rules carry no
/// API key, fills it with the service key looked up for this extractor's alias.
/// </summary>
/// <param name="rules">Site rules passed on to the base constructor.</param>
public XmlFeedExtractor(SiteRules rules)
    : base(rules)
{
    //determine feed type
    SetFeedTypes();

    // An API key supplied in the rules is used as is
    if (!string.IsNullOrEmpty(Rules.ApiKey))
    {
        return;
    }

    //no api key provided, so assume the license key instead
    var serviceKey = ClientData.Instance.GetServiceKey(Alias);
    Rules.ApiKey = serviceKey;
}
/// <summary>
/// Builds a Shopify extractor. The credentials are validated first; feed types are only
/// determined when the credentials are valid, otherwise an error is logged (when a log exists).
/// </summary>
/// <param name="rules">Site rules passed on to the base constructor.</param>
public ShopifyExtractor(SiteRules rules)
    : base(rules)
{
    //TODO: Implement ValidateCredentials for all carts and move this check to the base class
    string status;
    HasCredentials = ValidateCredentials(out status);

    // The decision depends on the credentials alone. Previously a missing log instance
    // made the else branch run, so feed types were set up even with invalid credentials.
    if (HasCredentials)
    {
        // determine feed types
        SetFeedTypes();
    }
    else if (BoostLog.Instance != null)
    {
        BoostLog.Instance.WriteEntry(EventLogEntryType.Error, "Invalid Shopify API Credentials", status, Alias);
    }
}
/// <summary>
/// Save the site rules xml into the client data folder for the alias and, when rules are
/// supplied for a site without a cart extractor, also write the ConfigBoostOverride table.
/// Saving is best-effort: failures are logged (when a log exists) and never rethrown.
/// </summary>
/// <param name="alias">Client alias used to resolve the data folder and the table owner.</param>
/// <param name="settings">Rules xml to save.</param>
/// <param name="rules">Optional parsed rules; when null the override table is not written.</param>
/// <param name="wc">Optional web context, only used for the DEBUG log entry.</param>
public void SaveSiteRules(string alias, XElement settings, SiteRules rules = null, WebContextProxy wc = null)
{
    try
    {
#if DEBUG
        if (BoostLog.Instance != null && wc != null)
            BoostLog.Instance.WriteEntry(EventLogEntryType.Information,
                string.Format("Saving Rules for {0}", alias), "Context:\n" + wc.ToString(), alias);
#endif
        var path = DataPath.Instance.ClientDataPath(alias, true);
        if (!Directory.Exists(path))
            Directory.CreateDirectory(path);
        settings.Save(path + SiteRulesFileName);

        //check to see if we should update BoostConfigOverride (used when customers upload data directly to us)
        if (rules == null || rules.CartExtractorExists || TableAccess.Instance == null)
            return;

        var data = rules.FormatGeneratorConfig();
        TableAccess.Instance.WriteTable(alias, "ConfigBoostOverride.txt", data);
    }
    catch (System.Exception ex)
    {
        // Callers still never see an exception, but a failed save is no longer invisible
        if (BoostLog.Instance != null)
            BoostLog.Instance.WriteEntry(EventLogEntryType.Error,
                string.Format("Error Saving Rules for {0}", alias), ex.ToString(), alias);
    }
}
/// <summary>
/// Replace the current rules with a new set that belongs to the same alias.
/// </summary>
/// <param name="rules">Replacement rules; must carry the same alias as the current rules.</param>
/// <returns>true when the rules were replaced; false when <paramref name="rules"/> is null
/// or its alias differs from the current one.</returns>
public bool SetRules(SiteRules rules)
{
    // A null replacement is rejected instead of failing with a NullReferenceException
    if (rules == null)
        return false;

    // Static Equals keeps the same comparison but tolerates a null alias on either side
    if (!string.Equals(rules.Alias, Rules.Alias))
        return false; //can't change the alias

    Rules = rules;
    return true;
}
/// <summary>
/// Base constructor for all cart extractors. Stores the rules, resolves the data read and
/// write paths for the alias, and creates the progress tracker, the data handlers and the
/// (legacy) catalog collections.
/// </summary>
/// <param name="rules">Site rules for the client; must not be null.</param>
/// <exception cref="NoNullAllowedException">Thrown when <paramref name="rules"/> is null.</exception>
protected CartExtractor(SiteRules rules)
{
    if (rules == null) throw new NoNullAllowedException("CartExtractor Rules cannot be null");
    Rules = rules;
    Alias = Rules.Alias;
    // Alias is passed by ref for the read path, so the call is able to replace it before the
    // write path is resolved -- NOTE(review): confirm what ClientDataPath does to the alias
    DataReadPath = IO.DataPath.Instance.ClientDataPath(ref Alias, true);
    DataWritePath = IO.DataPath.Instance.ClientDataPath(Alias, true);
    Progress = new ExtractorProgress();
    IsExtractorQueued = false;
    IsExtracting = false;

    // Service access and featured recommendations are compiled out for the test site build
#if !CART_EXTRACTOR_TEST_SITE
    BoostService = RestAccess.Instance;
    FeaturedCrossSells = new FeaturedRecommendations();
    FeaturedUpSells = new FeaturedRecommendations();
#endif

    ExclusionStats = new Dictionary<string, int>();

    // Each handler receives this (still partially constructed) extractor together with the
    // rules and the shared progress object, so the order of these statements is kept as is
    Catalog = new CatalogHandler(Rules, this, Progress);
    Inventory = new InventoryHandler(Rules, this, Progress);
    Sales = new SalesHandler(Rules, this, Progress);
    Customers = new CustomerHandler(Rules, this, Progress);
    CategoryNames = new AttributeHandler(Rules, this, Progress, DataGroup.CategoryNames);
    BrandNames = new AttributeHandler(Rules, this, Progress, DataGroup.ManufacturerNames);
    DepartmentNames = new AttributeHandler(Rules, this, Progress, DataGroup.DepartmentNames);

    //TODO: Move all below to the CatalogHandler class or deprecate
    Exclusions = new List<ExclusionRecord>();
    Replacements = new List<ReplacementRecord>();
    ParentProducts = new Dictionary<string, ParentItem>();
    AltPrices = new Dictionary<string, List<string>>();
    AltPageLinks = new Dictionary<string, List<string>>();
    AltImageLinks = new Dictionary<string, List<string>>();
    AltTitles = new Dictionary<string, List<string>>();
    Departments = new Dictionary<string, string>();
    ExclusionCauses = new Dictionary<string, string>();
}
/// <summary>
/// Creates the categories collection for the given site rules, starting with an empty
/// string-to-string dictionary.
/// </summary>
/// <param name="Rules">Site rules kept by this instance.</param>
public Categories(SiteRules Rules)
{
    this.Rules = Rules;
    d = new Dictionary<string, string>();
}
/// <summary>
/// Creates the handler for the DataGroup.Sales data group; every argument is forwarded
/// to the base constructor.
/// </summary>
public SalesHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
    : base(rules, cart, progress, DataGroup.Sales)
{
}
/// <summary>
/// Creates an attribute handler for the data group chosen by the caller; every argument
/// is forwarded to the base constructor.
/// </summary>
public AttributeHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
    : base(rules, cart, progress, group)
{
}
/// <summary>
/// Builds the cart extractor for the alias typed into the client alias text box.
/// </summary>
/// <returns>The extractor returned by CartExtractor.GetCart, or null when the site rules
/// cannot be read (an error message is then shown in the results panel).</returns>
private CartExtractor GetCart()
{
    var alias = TextBoxClientAlias.Text;
    XElement settings = ReadSiteRules(alias, "upload");

    if (settings != null)
    {
        var rules = new SiteRules(alias, 1, settings);
        return CartExtractor.GetCart(rules);
    }

    // No rules available: report it on the page and give up
    TextBoxResults.Text = "Error Reading Site Rules\n";
    UpdatePanelResults.Update();
    return null;
}
/// <summary>
/// Creates the handler for the DataGroup.Customers data group; every argument is forwarded
/// to the base constructor.
/// </summary>
public CustomerHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
    : base(rules, cart, progress, DataGroup.Customers)
{
}
/// <summary>
/// Creates a catalog bound to the given site rules.
/// </summary>
/// <param name="Rules">Site rules kept by this instance.</param>
/// <param name="inventoryOnly">Flag stored as given in the inventoryOnly member.</param>
public Catalog(SiteRules Rules, bool inventoryOnly)
{
    this.inventoryOnly = inventoryOnly;
    this.Rules = Rules;
}
/// <summary>
/// Creates the sales output and adds its header row, using the field names configured
/// in the rules for the order id, product id, customer id, quantity and date columns.
/// </summary>
/// <param name="Rules">Site rules that supply the configured field names.</param>
public Sales(SiteRules Rules)
{
    var header = new List<string>
    {
        Rules.Fields.GetName(FieldName.OrderId),
        Rules.Fields.GetName(FieldName.OrderProductId),
        Rules.Fields.GetName(FieldName.OrderCustomerId),
        Rules.Fields.GetName(FieldName.OrderQuantity),
        Rules.Fields.GetName(FieldName.OrderDate)
    };

    output.Add(header);
}
/// <summary>
/// Creates the customers output and adds its header row. Only the customer id column
/// takes its name from the rules; the remaining column names are fixed.
/// </summary>
/// <param name="Rules">Site rules that supply the configured customer id field name.</param>
public Customers(SiteRules Rules)
{
    var header = new List<string>
    {
        Rules.Fields.GetName(FieldName.CustomerId),
        "Email",
        "Name",
        "Address",
        "City",
        "State",
        "PostalCode",
        "Country",
        "Phone",
        "Gender",
        "Birthday",
        "AgeRange",
        "AlternativeIDs",
        "DoNotTrack"
    };

    output.Add(header);
}
/// <summary>
/// Factory that creates the extractor matching the cart type in the rules.
/// </summary>
/// <param name="rules">Site rules; CartType (and PluginVersion for Magento) select the extractor.</param>
/// <returns>The new extractor, or null when the cart type has no extractor.</returns>
public static CartExtractor GetCart(SiteRules rules)
{
    switch (rules.CartType)
    {
#if !CART_EXTRACTOR_TEST_SITE
        case CartType.ThreeDCart:
            return new ThreeDCartExtractor(rules);
#endif
        case CartType.BigCommerce:
            return new BigCommerceExtractor(rules);

        case CartType.MivaMerchant:
            return new MivaMerchantExtractor(rules);

        case CartType.Volusion:
            return new VolusionExtractor(rules);

        case CartType.CommerceV3:
            return new CommerceV3Extractor(rules);

        case CartType.Magento:
            // Only plugin versions above 3 are handled, and they use the json feed extractor
            if (rules.PluginVersion > 3)
            {
                return new JsonFeedExtractor(rules);
            }
            return null;

        case CartType.Shopify:
            return new ShopifyExtractor(rules);

        //NetSuite has no case here (its extractor is disabled), so it ends up in default

        case CartType.AspDotNetStorefront:
        case CartType.WebsitePipeline:
        case CartType.TabbedFeed:
        case CartType.JsonFeed:
            return new JsonFeedExtractor(rules);

        case CartType.XmlFeed:
            return new XmlFeedExtractor(rules);

        case CartType.Test:
            return new TestExtractor(rules);

        case CartType.osCommerce:
        case CartType.PrestaShop:
        case CartType.Other:
        default:
            return null;
    }
}
/// <summary>
/// Runs GetData for the extract type carried by the given schedule entry.
/// </summary>
/// <param name="ut">Schedule entry whose ExtractType is passed on.</param>
public void GetData(SiteRules.ExtractorSchedule ut)
{
    var extractType = ut.ExtractType;
    GetData(extractType);
}
/// <summary>
/// Set up the rules of the old site for a migration. Old rules xml is used when supplied;
/// otherwise, when Use4TellCatalog and Use4TellSales are both set, generic TabbedFeed rules
/// are created; in any other case migration is disabled. CheckDate is called at the end
/// unless migration was disabled.
/// </summary>
/// <param name="oldRulesXml">Rules xml of the old site, or null.</param>
/// <param name="alias">Alias used for the rules that get created.</param>
/// <param name="tier">Tier used for the rules that get created.</param>
/// <param name="validMonths">Value handed to CheckDate.</param>
public void InitRules(XElement oldRulesXml, string alias, int tier, int validMonths)
{
    var hasOldRules = oldRulesXml != null;

    // Neither old rules nor the full set of 4-Tell files: nothing to migrate from
    if (!hasOldRules && !(Use4TellCatalog && Use4TellSales))
    {
        Enabled = false;
        return;
    }

    if (hasOldRules)
    {
        MigrationAlias = Input.GetValue(oldRulesXml, "alias");
        OldRules = new SiteRules(alias, tier, oldRulesXml);
    }
    else
    {
        //create generic TabbedFeed rules to read 4-Tell files
        MigrationAlias = alias;
        OldRules = new SiteRules(alias, "", (BoostTier)tier, CartType.TabbedFeed, "", false);
        OldRules.ApiExtraHeaders = 1;
        OldRules.CatalogFeedUrl = "file:Catalog.txt";
        var cartRules = OldRules.ReadCartRules(CartType.TabbedFeed);
        OldRules.Fields.InitializeFields(cartRules, true);
    }

    CheckDate(validMonths);
}
/// <summary>
/// Fetch product images from the site's category pages and add them to the image catalog,
/// keyed by product id. The first categories page is loaded to decide between a flat scrape
/// (paging through /categories) and a tree scrape (paging through every url in catUrls).
/// </summary>
/// <param name="rules">Site rules: node selector, store url, forced tree scrape flag and comment parse key.</param>
/// <param name="catUrls">Category urls (appended to the store url) used for the tree scrape.</param>
/// <param name="imageCatalog">Catalog of image urls keyed by product id; new entries are added, existing keys are left alone.</param>
/// <param name="itemCount">Item count used to estimate the maximum number of pages.</param>
/// <param name="progress">Progress object that receives status updates.</param>
internal static void LoadProductImagesForCategory(SiteRules rules, List<string> catUrls, ref Dictionary<string, string> imageCatalog, int itemCount, ExtractorProgress progress )
{
    var nodeSelector = rules.ProductNodeSelector; //Xpath to find product nodes
    if (string.IsNullOrEmpty(nodeSelector))
        nodeSelector = "//div[@class='ProductImage QuickView']//img[contains(@src,'products')]";

    //determine whether the categories have a tree or flat structure
    var displayType = rules.ForceCategoryTreeScrape ? CategoryDisplayType.Tree : CategoryDisplayType.Flat;
    var maxPages = 1;
    var uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page=1", rules.StoreShortUrl));
    var thisWeb = new HtmlWeb();
    var thisDoc = thisWeb.Load(uri.AbsoluteUri);
    var searchSpring = false;
    //while (!rules.ForceCategoryTreeScrape) //NOTE: could skip while loop in this case, but left out for testing
    while (true) //single-pass loop to set display type (every path through the body ends in a break)
    {
        //check for use of searchspring --Ajax response is not seen by HtmlAgilityPack
        var matching = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-results_container']");
        //var matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
        if (matching != null)
        {
            searchSpring = true;
            matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
            if (matching == null) break;
            // Main div not flagged as "no results": keep the display type chosen above.
            // NOTE(review): Attributes["class"] is dereferenced without a null check -- confirm the div always has a class
            if (!matching[0].Attributes["class"].Value.Contains("searchspring-no_results"))
                //var noResults = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-no_results']");
                //if (noResults == null)
                break;
            displayType = CategoryDisplayType.Tree;
            break;
        }
        matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
        if (matching != null) break;
        // The configured selector found nothing, so try a looser one and adopt it when it matches
        var testSelector = "//div[starts-with(@class,'ProductImage')]"; //"//div[@class='ProductImage']//img[@src]";
        matching = thisDoc.DocumentNode.SelectNodes(testSelector);
        if (matching != null)
        {
            nodeSelector = testSelector;
            break;
        }
        // No product nodes on the categories page at all: fall back to the tree scrape
        displayType = CategoryDisplayType.Tree;
        break;
    }

    var pageStatusFormat = progress.ExtraStatus + " --page {0} of {1} (max)";
    if (displayType == CategoryDisplayType.Flat)
    {
        // we have a catalog setup with a cover page, so we need to resolve the Catalog first.
        //get page count
        if (searchSpring)
        {
            var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
            if (totalPages != null)
                maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
        }
        else
        {
            //estimate max pages from the number of product nodes on the first page
            //var nodes = thisDoc.DocumentNode.SelectNodes("//div[@id='CategoryContent']//ul/li");
            var nodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
            var imagesPerPage = nodes == null ? 1 : nodes.Count;
            maxPages = (int) Math.Ceiling((decimal) itemCount/(decimal) imagesPerPage);
        }

        //do // loop through pages incrementally until we don't have anything to match
        for (var page = 1; page <= maxPages; page++)
        {
            try
            {
                //we already got the first page above, so process it and then grab the next
                var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                progress.UpdateTask(imageCatalog.Count, -1, null, pageStatus);
                var imageNodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                if (imageNodes == null) break; // not all products usually have images, so this could end before we run out of pages
                var imageUrl = "";
                var pid = "";
                foreach (var node in imageNodes)
                {
                    imageUrl = GetUrlFromMatch(node, out pid, ref rules);
                    if (string.IsNullOrEmpty(pid)) continue; //nothing to key the image by
                    if (!imageCatalog.ContainsKey(pid)) //first image found for a product wins
                    {
                        imageCatalog.Add(pid, imageUrl);
                        progress.UpdateTask(imageCatalog.Count);
                    }
                }
                // load the next page so the following iteration has something to process
                uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page={1}", rules.StoreShortUrl, page + 1));
                thisDoc = thisWeb.Load(uri.AbsoluteUri);
            }
            catch (Exception ex)
            {
                // Only written to the debug output; the loop then continues with the next
                // page number while thisDoc still holds the last page that did load
                Debug.WriteLine(ex.Message);
            }
        }
    }
    else //Tree structure
    {
        try
        {
            var catCount = 0;
            var totalCats = catUrls.Count;
            foreach (var catUrl in catUrls)
            {
                var details = string.Format("Pulling images for {0} ({1} of {2} categories)", catUrl.Replace("/", ""), ++catCount, totalCats);
                progress.UpdateTask(imageCatalog.Count, -1, null, details);

                //get the first page of results
                var thisCatUrl = String.Format("http://{0}{1}?sort=priceasc", rules.StoreShortUrl, catUrl);
                var catUri = new Uri(thisCatUrl);
                try
                {
                    thisDoc = thisWeb.Load(catUri.AbsoluteUri);
                }
                catch (Exception ex)
                {
                    // A category whose first page cannot be loaded is logged and skipped
                    if (BoostLog.Instance != null)
                        BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to Load " + catUri.AbsoluteUri, ex);
                    continue;
                }

                //get page count
                maxPages = 0;
                if (searchSpring)
                {
                    var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
                    if (totalPages != null)
                        maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
                }
                if (maxPages < 1)
                {
                    //TODO: See if there is a better way to calculate pages.
                    maxPages = itemCount/10; //integer division, so this is 0 for fewer than 10 items
                }

                var page = 1;
                do
                {
                    var matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                    //var matching = thisDoc.DocumentNode.SelectNodes("//div[starts-with(@class,'ProductImage')]//img[@src]");
                    //if (matching == null)
                    //    matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage[QuickView]?')]//img[@src]");
                    //if (matching == null)
                    //    matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage']//img[@src]");
                    //if (matching == null)
                    //    matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage QuickView']//img[@src]");
                    if (!string.IsNullOrEmpty(rules.CommentParseKey)) //sometimes the nodes we want are hidden in a comment
                    {
                        // NOTE(review): the key is placed into the XPath without quotes -- presumably
                        // the configured value carries its own quotes; confirm against the rules data
                        var commentNodes = thisDoc.DocumentNode.SelectNodes(string.Format("//comment()[contains(., {0})]", rules.CommentParseKey));
                        if (commentNodes != null)
                        {
                            foreach (var c in commentNodes)
                            {
                                try
                                {
                                    // Parse the comment text as its own document and run the product selector on it
                                    var comment = new HtmlDocument();
                                    comment.LoadHtml(c.InnerHtml.Replace("<!--", "").Replace("-->", ""));
                                    var partialMatch = comment.DocumentNode.SelectNodes(nodeSelector);
                                    // The else below belongs to the inner if: matches from the comment
                                    // either become the result or are appended to the existing one
                                    if (partialMatch != null)
                                        if (matching == null) matching = partialMatch;
                                        else
                                            foreach (var match in partialMatch)
                                                matching.Add(match);
                                }
                                catch (Exception ex)
                                {
                                    if (BoostLog.Instance != null)
                                        BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to parse comment node", ex);
                                }
                            }
                        }
                    }
                    if (matching == null) break; //no product nodes on this page: done with this category

                    var matchCount = matching.Count; //not read again within this method
                    var oldCount = imageCatalog.Count;
                    var imageUrl = "";
                    var pid = "";
                    foreach (var node in matching)
                    {
                        imageUrl = GetUrlFromMatch(node, out pid, ref rules);
                        if (string.IsNullOrEmpty(pid)) continue; //nothing to key the image by
                        if (!imageCatalog.ContainsKey(pid)) //first image found for a product wins
                        {
                            imageCatalog.Add(pid, imageUrl);
                            progress.UpdateTask(imageCatalog.Count);
                        }
                    }
                    if (imageCatalog.Count == oldCount) break; //no new images found

                    var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                    progress.UpdateTask(imageCatalog.Count, -1, null, details + pageStatus);
                    if (++page > maxPages) break;

                    // This load is outside the inner try above, so a failure here is handled by
                    // the outer catch and ends the scrape for the remaining categories as well
                    thisCatUrl = String.Format("http://{0}{1}?sort=priceasc&page={2}", rules.StoreShortUrl, catUrl, page);
                    catUri = new Uri(thisCatUrl);
                    thisDoc = thisWeb.Load(catUri.AbsoluteUri);
                } while (true);
            }
        }
        catch (Exception ex)
        {
            // Only written to the debug output
            Debug.WriteLine(ex.Message);
        }
    }
}
/// <summary>
/// Creates a tabbed feed extractor; the rules are handed to the base constructor and
/// nothing else is set up here.
/// </summary>
public TabbedFeedExtractor(SiteRules rules)
    : base(rules)
{
}
private Dictionary<string, ParentItem> _parentProducts; //key is product id

#endregion Fields

#region Constructors

/// <summary>
/// Creates the handler for the DataGroup.Catalog data group; every argument is forwarded
/// to the base constructor.
/// </summary>
public CatalogHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
    : base(rules, cart, progress, DataGroup.Catalog)
{
}
/// <summary>
/// Creates the handler for the DataGroup.Inventory data group; every argument is forwarded
/// to the base constructor.
/// </summary>
public InventoryHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
    : base(rules, cart, progress, DataGroup.Inventory)
{
}
/// <summary>
/// Builds a BigCommerce extractor. The credentials are validated first; feed types are only
/// determined when the credentials are valid, otherwise an error is logged (when a log exists).
/// The API adaptor is created in either case.
/// </summary>
/// <param name="rules">Site rules passed on to the base constructor; ApiUrl is handed to the adaptor.</param>
public BigCommerceExtractor(SiteRules rules)
    : base(rules)
{
    //TODO: Implement ValidateCredentials for all carts and move this check to the base class
    string status;
    HasCredentials = ValidateCredentials(out status);

    // The decision depends on the credentials alone. Previously a missing log instance
    // made the else branch run, so feed types were set up even with invalid credentials.
    if (HasCredentials)
    {
        // determine feed types
        SetFeedTypes();
    }
    else if (BoostLog.Instance != null)
    {
        BoostLog.Instance.WriteEntry(EventLogEntryType.Error, "Invalid BigCommerce API Credentials", status, Alias);
    }

    adaptor = new BigCommerceAdaptor(_apiKey, _userName, Rules.ApiUrl, this);
}
/// <summary>
/// Creates the brands collection for the given site rules, starting with an empty
/// string-to-string dictionary.
/// </summary>
/// <param name="Rules">Site rules kept by this instance.</param>
public Brands(SiteRules Rules)
{
    this.Rules = Rules;
    d = new Dictionary<string, string>();
}