Пример #1
0
        /// <summary>
        /// Extract the image URL and the product id from a matched HtmlNode.
        /// </summary>
        /// <remarks>
        /// Both values are cut out of the node's OuterHtml as the text between a prefix and a suffix.
        /// When the rules do not supply them, the image URL is taken between <c>src="</c> and <c>"</c>,
        /// and the product id between <c>/products/</c> and <c>/</c>.
        /// A leading protocol (everything before the first "//" inside the URL) is dropped, so an
        /// absolute URL is returned in protocol-relative form.
        /// </remarks>
        /// <param name="node">The matched node; also used as the search target when a selector is empty or matches nothing</param>
        /// <param name="pid">Receives the product id, or null when no pid node or pid prefix is found</param>
        /// <param name="rules">Supplies the optional image/pid selectors, prefixes and suffixes; it is only read here</param>
        /// <returns>The image URL, or an empty string when no image node or image prefix is found</returns>
        internal static string GetUrlFromMatch(HtmlNode node, out string pid, ref SiteRules rules)
        {
            pid = null;

            //standard link looks like this
            //http://cdn2.bigcommerce.com/server3300/46048/products/10676/images/91127/Hunter_Original_Tall_Boot_Aubergine_4__95507.1343675413.185.279.jpg
            var imageUrlPrefix = string.IsNullOrEmpty(rules.ImageUrlPrefix) ? "src=\"" : rules.ImageUrlPrefix;
            var imageUrlSuffix = string.IsNullOrEmpty(rules.ImageUrlSuffix) ? "\"" : rules.ImageUrlSuffix;
            var pidPrefix = string.IsNullOrEmpty(rules.PidPrefix) ? "/products/" : rules.PidPrefix;
            var pidSuffix = string.IsNullOrEmpty(rules.PidSuffix) ? "/" : rules.PidSuffix;

            //find the image url
            var imageNode = SelectFirstOrSelf(node, rules.ImageUrlSelector);
            if (imageNode == null) return "";
            var html = imageNode.OuterHtml;
            var src = html.IndexOf(imageUrlPrefix, StringComparison.Ordinal);
            if (src < 0) return "";
            src += imageUrlPrefix.Length;

            //locate the end of the url first so the protocol search cannot run past it
            //(a relative url followed by another attribute containing "//" used to be mis-parsed)
            var end = html.IndexOf(imageUrlSuffix, src, StringComparison.Ordinal);
            var limit = end < 0 ? html.Length : end;
            var start = html.IndexOf("//", src, limit - src, StringComparison.Ordinal); //skip protocol
            if (start < 0) start = src;
            var url = html.Substring(start, limit - start);

            //find the product id
            var pidNode = SelectFirstOrSelf(node, rules.PidSelector);
            if (pidNode == null) return url;
            html = pidNode.OuterHtml;
            start = html.IndexOf(pidPrefix, StringComparison.Ordinal);
            if (start < 0) return url;
            start += pidPrefix.Length;
            end = html.IndexOf(pidSuffix, start, StringComparison.Ordinal);
            pid = end < 0 ? html.Substring(start) : html.Substring(start, end - start);

            return url;
        }

        /// <summary>
        /// Resolve the node to search: the first node matching <paramref name="selector"/> under
        /// <paramref name="node"/>, or <paramref name="node"/> itself when the selector is empty
        /// or SelectNodes returns null.
        /// </summary>
        private static HtmlNode SelectFirstOrSelf(HtmlNode node, string selector)
        {
            if (string.IsNullOrEmpty(selector)) return node;
            var nodeList = node.SelectNodes(selector);
            return nodeList == null ? node : nodeList.FirstOrDefault();
        }
 /// <summary>
 /// Stores the shared rules, owning cart, progress tracker and data group, stamps the export date,
 /// works out whether this site is a migration slave, and finishes by calling Reset().
 /// </summary>
 /// <param name="rules">Site rules; must not be null because MigrationRules is read from it</param>
 /// <param name="cart">The cart extractor this handler belongs to</param>
 /// <param name="progress">Progress tracker shared with the cart</param>
 /// <param name="group">The data group this handler is responsible for</param>
 public DataHandlerBase(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
 {
     _rules = rules;
     _cart = cart;
     _group = group;
     _progress = progress;

     _exportDateInitialized = false;
     _exportDate = DateTime.Now;

     //a migration slave is a site with migration enabled that is not the migration master
     var migration = _rules.MigrationRules;
     if (migration == null)
         _migrationSlave = false;
     else
         _migrationSlave = migration.Enabled && !migration.IsMigrationMaster;

     Reset();
 }
        /// <summary>
        /// Creates an XML feed extractor: determines the feed types and makes sure an API key is set.
        /// </summary>
        /// <param name="rules">Site rules, forwarded to the base constructor</param>
        public XmlFeedExtractor(SiteRules rules)
            : base(rules)
        {
            SetFeedTypes(); //determine feed type

            //an explicit api key wins; otherwise fall back to the service (license) key for this alias
            if (!string.IsNullOrEmpty(Rules.ApiKey)) return;
            Rules.ApiKey = ClientData.Instance.GetServiceKey(Alias);
        }
 /// <summary>
 /// Creates a Shopify extractor, validates the API credentials and, only when they are valid,
 /// determines the feed types.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 public ShopifyExtractor(SiteRules rules)
     : base(rules)
 {
     //TODO: Implement ValidateCredentials for all carts and move this check to the base class
     string status;
     HasCredentials = ValidateCredentials(out status);
     if (HasCredentials)
     {
         // determine feed types
         SetFeedTypes();
     }
     else if (BoostLog.Instance != null)
     {
         //previously the else branch was tied to the combined condition, so feed types were
         //still set for invalid credentials whenever no log instance was available
         BoostLog.Instance.WriteEntry(EventLogEntryType.Error, "Invalid Shopify API Credentials", status, Alias);
     }
 }
        /// <summary>
        /// Save the site rules xml into the client's data folder and, when applicable, refresh the
        /// ConfigBoostOverride table. Failures do not propagate to the caller; they are written to
        /// BoostLog when it is available.
        /// </summary>
        /// <param name="alias">Client alias used to locate the data folder and the override table</param>
        /// <param name="settings">Rules xml to save</param>
        /// <param name="rules">Optional parsed rules; the override table is only written when this is supplied,
        /// no cart extractor exists for it, and TableAccess is available</param>
        /// <param name="wc">Optional web context, only used for debug logging</param>
        public void SaveSiteRules(string alias, XElement settings, SiteRules rules = null, WebContextProxy wc = null)
        {
            try
            {
            #if DEBUG
                if (BoostLog.Instance != null && wc != null)
                    BoostLog.Instance.WriteEntry(EventLogEntryType.Information,
                        string.Format("Saving Rules for {0}", alias), "Context:\n" + wc.ToString(), alias);
            #endif
                var path = DataPath.Instance.ClientDataPath(alias, true);
                if (!Directory.Exists(path)) Directory.CreateDirectory(path);
                settings.Save(path + SiteRulesFileName);

                //check to see if we should update BoostConfigOverride (used when customers upload data directly to us)
                if (rules == null || rules.CartExtractorExists || TableAccess.Instance == null) return;

                var data = rules.FormatGeneratorConfig();
                TableAccess.Instance.WriteTable(alias, "ConfigBoostOverride.txt", data);
            }
            catch (Exception ex)
            {
                //saving stays best-effort, but a failure must leave a trace instead of vanishing
                if (BoostLog.Instance != null)
                    BoostLog.Instance.WriteEntry(EventLogEntryType.Error,
                        string.Format("Error saving Rules for {0}", alias), ex.ToString(), alias);
            }
        }
 /// <summary>
 /// Replace the current rules with <paramref name="rules"/>, provided the alias is unchanged.
 /// </summary>
 /// <param name="rules">The new rules; its Alias must equal the current Rules.Alias</param>
 /// <returns>true when the rules were replaced, false when the alias differs</returns>
 public bool SetRules(SiteRules rules)
 {
     //the alias can't change
     var sameAlias = rules.Alias.Equals(Rules.Alias);
     if (sameAlias)
     {
         Rules = rules;
     }
     return sameAlias;
 }
        /// <summary>
        /// Base constructor for all cart extractors: stores the rules, resolves the client data paths,
        /// and creates the progress tracker, the per-data-group handlers and the working collections.
        /// </summary>
        /// <param name="rules">Site rules for the client; must not be null</param>
        /// <exception cref="NoNullAllowedException">Thrown when <paramref name="rules"/> is null</exception>
        protected CartExtractor(SiteRules rules)
        {
            if (rules == null) throw new NoNullAllowedException("CartExtractor Rules cannot be null");
            Rules = rules;
            Alias = Rules.Alias;
            //Alias is passed by ref here, so ClientDataPath is able to replace it; the write path below
            //is resolved from whatever value Alias holds afterwards
            //NOTE(review): confirm what ClientDataPath(ref ...) rewrites and why the two overloads differ
            DataReadPath = IO.DataPath.Instance.ClientDataPath(ref Alias, true);
            DataWritePath = IO.DataPath.Instance.ClientDataPath(Alias, true);
            Progress = new ExtractorProgress();
            IsExtractorQueued = false;
            IsExtracting = false;

            //service access and featured recommendations are compiled out when CART_EXTRACTOR_TEST_SITE is defined
            #if !CART_EXTRACTOR_TEST_SITE
            BoostService = RestAccess.Instance;
            FeaturedCrossSells = new FeaturedRecommendations();
            FeaturedUpSells = new FeaturedRecommendations();
            #endif
            ExclusionStats = new Dictionary<string, int>();
            //every handler receives the same Rules and Progress plus a reference back to this extractor,
            //so Rules and Progress must already be assigned at this point
            Catalog = new CatalogHandler(Rules, this, Progress);
            Inventory = new InventoryHandler(Rules, this, Progress);
            Sales = new SalesHandler(Rules, this, Progress);
            Customers = new CustomerHandler(Rules, this, Progress);
            //the three attribute lists share one handler type and differ only by data group
            CategoryNames = new AttributeHandler(Rules, this, Progress, DataGroup.CategoryNames);
            BrandNames = new AttributeHandler(Rules, this, Progress, DataGroup.ManufacturerNames);
            DepartmentNames = new AttributeHandler(Rules, this, Progress, DataGroup.DepartmentNames);

            //TODO: Move all below to the CatalogHandler class or deprecate
            Exclusions = new List<ExclusionRecord>();
            Replacements = new List<ReplacementRecord>();
            ParentProducts = new Dictionary<string, ParentItem>();
            AltPrices = new Dictionary<string, List<string>>();
            AltPageLinks = new Dictionary<string, List<string>>();
            AltImageLinks = new Dictionary<string, List<string>>();
            AltTitles = new Dictionary<string, List<string>>();
            Departments = new Dictionary<string, string>();
            ExclusionCauses = new Dictionary<string, string>();
        }
 /// <summary>
 /// Creates an empty category lookup bound to the given site rules.
 /// </summary>
 /// <param name="Rules">Site rules stored on this instance</param>
 public Categories(SiteRules Rules)
 {
     this.Rules = Rules;
     d = new Dictionary<string, string>();
 }
 /// <summary>
 /// Creates the sales data handler. All arguments are forwarded to the base constructor
 /// together with <c>DataGroup.Sales</c>; no further initialization happens here.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 /// <param name="cart">Owning cart extractor, forwarded to the base constructor</param>
 /// <param name="progress">Progress tracker, forwarded to the base constructor</param>
 public SalesHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Sales)
 {
 }
 /// <summary>
 /// Creates an attribute data handler for the given data group. All arguments are forwarded
 /// unchanged to the base constructor; no further initialization happens here.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 /// <param name="cart">Owning cart extractor, forwarded to the base constructor</param>
 /// <param name="progress">Progress tracker, forwarded to the base constructor</param>
 /// <param name="group">The data group this handler serves, forwarded to the base constructor</param>
 public AttributeHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
     : base(rules, cart, progress, group)
 {
 }
 /// <summary>
 /// Build a cart extractor for the alias typed into the client alias text box.
 /// </summary>
 /// <returns>The extractor returned by CartExtractor.GetCart, or null (after showing an error
 /// in the results panel) when the site rules cannot be read</returns>
 private CartExtractor GetCart()
 {
     var alias = TextBoxClientAlias.Text;
     XElement settings = ReadSiteRules(alias, "upload");
     if (settings != null)
     {
         return CartExtractor.GetCart(new SiteRules(alias, 1, settings));
     }

     //no rules available: report it on the page and give up
     TextBoxResults.Text = "Error Reading Site Rules\n";
     UpdatePanelResults.Update();
     return null;
 }
 /// <summary>
 /// Creates the customer data handler. All arguments are forwarded to the base constructor
 /// together with <c>DataGroup.Customers</c>; no further initialization happens here.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 /// <param name="cart">Owning cart extractor, forwarded to the base constructor</param>
 /// <param name="progress">Progress tracker, forwarded to the base constructor</param>
 public CustomerHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Customers)
 {
 }
 /// <summary>
 /// Creates a catalog bound to the given site rules.
 /// </summary>
 /// <param name="Rules">Site rules stored on this instance</param>
 /// <param name="inventoryOnly">Flag stored on this instance as given</param>
 public Catalog(SiteRules Rules, bool inventoryOnly)
 {
     this.inventoryOnly = inventoryOnly;
     this.Rules = Rules;
 }
 /// <summary>
 /// Starts the sales output with a header row whose column names come from the rules' field names.
 /// </summary>
 /// <param name="Rules">Site rules supplying the configured names of the order fields</param>
 public Sales(SiteRules Rules)
 {
     //column order: order id, product id, customer id, quantity, date
     var header = new List<string>
     {
         Rules.Fields.GetName(FieldName.OrderId),
         Rules.Fields.GetName(FieldName.OrderProductId),
         Rules.Fields.GetName(FieldName.OrderCustomerId),
         Rules.Fields.GetName(FieldName.OrderQuantity),
         Rules.Fields.GetName(FieldName.OrderDate)
     };
     output.Add(header);
 }
 /// <summary>
 /// Starts the customer output with a header row: the configured customer id field name
 /// followed by the fixed customer column names.
 /// </summary>
 /// <param name="Rules">Site rules supplying the configured name of the customer id field</param>
 public Customers(SiteRules Rules)
 {
     var header = new List<string>
     {
         Rules.Fields.GetName(FieldName.CustomerId),
         "Email",
         "Name",
         "Address",
         "City",
         "State",
         "PostalCode",
         "Country",
         "Phone",
         "Gender",
         "Birthday",
         "AgeRange",
         "AlternativeIDs",
         "DoNotTrack"
     };
     output.Add(header);
 }
 /// <summary>
 /// Factory: create the extractor that matches the cart type in the rules.
 /// </summary>
 /// <param name="rules">Site rules; CartType (and PluginVersion for Magento) select the extractor</param>
 /// <returns>The new extractor, or null when the cart type has no extractor
 /// (including Magento with PluginVersion of 3 or lower)</returns>
 public static CartExtractor GetCart(SiteRules rules)
 {
     switch (rules.CartType)
     {
     #if !CART_EXTRACTOR_TEST_SITE
         case CartType.ThreeDCart:
             return new ThreeDCartExtractor(rules);
     #endif
         case CartType.BigCommerce:
             return new BigCommerceExtractor(rules);
         case CartType.MivaMerchant:
             return new MivaMerchantExtractor(rules);
         case CartType.Volusion:
             return new VolusionExtractor(rules);
         case CartType.CommerceV3:
             return new CommerceV3Extractor(rules);
         case CartType.Shopify:
             return new ShopifyExtractor(rules);
         case CartType.XmlFeed:
             return new XmlFeedExtractor(rules);
         case CartType.Test:
             return new TestExtractor(rules);

         //newer Magento plugins are read through the json feed; older ones have no extractor
         case CartType.Magento:
             if (rules.PluginVersion > 3) return new JsonFeedExtractor(rules);
             return null;

         //all of these are read through the json feed extractor
         case CartType.AspDotNetStorefront:
         case CartType.WebsitePipeline:
         case CartType.TabbedFeed:
         case CartType.JsonFeed:
             return new JsonFeedExtractor(rules);

         //osCommerce, PrestaShop, Other and anything unlisted (NetSuite is not wired up): no extractor
         default:
             return null;
     }
 }
 /// <summary>
 /// Convenience overload: runs GetData for the extract type of the given schedule entry.
 /// </summary>
 /// <param name="ut">Schedule entry whose ExtractType is passed on</param>
 public void GetData(SiteRules.ExtractorSchedule ut)
 {
     var extractType = ut.ExtractType;
     GetData(extractType);
 }
 /// <summary>
 /// Set up the rules used to read the old (pre-migration) data and then call CheckDate.
 /// </summary>
 /// <remarks>
 /// With old rules xml, the migration alias is read from its "alias" value and OldRules is built from it.
 /// Without it, generic TabbedFeed rules are created, but only when both Use4TellCatalog and
 /// Use4TellSales are set; otherwise Enabled is set to false and nothing else happens.
 /// </remarks>
 /// <param name="oldRulesXml">Rules xml of the old site, or null</param>
 /// <param name="alias">Client alias</param>
 /// <param name="tier">Tier value passed to the SiteRules constructors</param>
 /// <param name="validMonths">Passed on to CheckDate</param>
 public void InitRules(XElement oldRulesXml, string alias, int tier, int validMonths)
 {
     var hasOldRules = oldRulesXml != null;

     //nothing to migrate from: switch off and stop
     if (!hasOldRules && !(Use4TellCatalog && Use4TellSales))
     {
         Enabled = false;
         return;
     }

     if (hasOldRules)
     {
         MigrationAlias = Input.GetValue(oldRulesXml, "alias");
         OldRules = new SiteRules(alias, tier, oldRulesXml);
     }
     else
     {
         //create generic TabbedFeed rules to read 4-Tell files
         MigrationAlias = alias;
         OldRules = new SiteRules(alias, "", (BoostTier)tier, CartType.TabbedFeed, "", false);
         OldRules.ApiExtraHeaders = 1;
         OldRules.CatalogFeedUrl = "file:Catalog.txt";
         var tabbedFeedRules = OldRules.ReadCartRules(CartType.TabbedFeed);
         OldRules.Fields.InitializeFields(tabbedFeedRules, true);
     }

     CheckDate(validMonths);
 }
Пример #19
0
        /// <summary>
        /// Fetch product images from the site's category pages and add them to the image catalog.
        /// </summary>
        /// <remarks>
        /// The first page of http://{StoreShortUrl}/categories is loaded to decide how to crawl:
        /// Flat walks the paged /categories listing, Tree walks every url in <paramref name="catUrls"/>.
        /// Tree is used when the rules force it, when searchspring reports no results, or when no product
        /// node is found on that first page. A failure loading that first page is not caught here.
        /// </remarks>
        /// <param name="rules">Site rules: store url, product node selector, comment parse key, tree-scrape flag</param>
        /// <param name="catUrls">Category paths appended to the store url (Tree crawl only)</param>
        /// <param name="imageCatalog">Product id to image url map; new product ids are added, existing ones are kept</param>
        /// <param name="itemCount">Number of catalog items, used to estimate the number of pages</param>
        /// <param name="progress">Progress tracker updated while crawling</param>
        internal static void LoadProductImagesForCategory(SiteRules rules, List<string> catUrls,
                                                          ref Dictionary<string, string> imageCatalog,
                                                          int itemCount, ExtractorProgress progress)
        {
            var nodeSelector = rules.ProductNodeSelector; //Xpath to find product nodes
            if (string.IsNullOrEmpty(nodeSelector))
                nodeSelector = "//div[@class='ProductImage QuickView']//img[contains(@src,'products')]";

            //determine whether the categories have a tree or flat structure
            var displayType = rules.ForceCategoryTreeScrape ? CategoryDisplayType.Tree : CategoryDisplayType.Flat;
            var maxPages = 1;

            var uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page=1", rules.StoreShortUrl));
            var thisWeb = new HtmlWeb();
            var thisDoc = thisWeb.Load(uri.AbsoluteUri);
            var searchSpring = false;

            //check for use of searchspring --Ajax response is not seen by HtmlAgilityPack
            if (thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-results_container']") != null)
            {
                searchSpring = true;
                var main = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
                //GetAttributeValue tolerates a missing class attribute (indexing Attributes["class"] did not)
                if (main != null && main[0].GetAttributeValue("class", "").Contains("searchspring-no_results"))
                    displayType = CategoryDisplayType.Tree;
            }
            else if (thisDoc.DocumentNode.SelectNodes(nodeSelector) == null)
            {
                //configured selector found nothing: try a looser one before falling back to the tree crawl
                var testSelector = "//div[starts-with(@class,'ProductImage')]";
                if (thisDoc.DocumentNode.SelectNodes(testSelector) != null)
                    nodeSelector = testSelector;
                else
                    displayType = CategoryDisplayType.Tree;
            }

            var pageStatusFormat = progress.ExtraStatus + " --page {0} of {1} (max)";
            if (displayType == CategoryDisplayType.Flat)
            {
                //get page count
                if (searchSpring)
                {
                    var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
                    if (totalPages != null)
                        maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
                }
                else
                {
                    //estimate max pages from the number of product nodes on the first page
                    var nodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                    var imagesPerPage = nodes == null || nodes.Count == 0 ? 1 : nodes.Count;
                    maxPages = (int) Math.Ceiling((decimal) itemCount/(decimal) imagesPerPage);
                }

                for (var page = 1; page <= maxPages; page++)
                {
                    try
                    {
                        //we already got the first page above, so process it and then grab the next
                        var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                        progress.UpdateTask(imageCatalog.Count, -1, null, pageStatus);

                        var imageNodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                        if (imageNodes == null)
                            break; // not all products usually have images, so this could end before we run out of pages

                        AddImagesToCatalog(imageNodes, imageCatalog, rules, progress);

                        //the last page has been processed: don't request a page that would never be read
                        if (page == maxPages) break;

                        uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page={1}", rules.StoreShortUrl, page + 1));
                        thisDoc = thisWeb.Load(uri.AbsoluteUri);
                    }
                    catch (Exception ex)
                    {
                        Debug.WriteLine(ex.Message);
                    }
                }
            }
            else //Tree structure
            {
                try
                {
                    var catCount = 0;
                    var totalCats = catUrls.Count;
                    foreach (var catUrl in catUrls)
                    {
                        var details = string.Format("Pulling images for {0} ({1} of {2} categories)", catUrl.Replace("/", ""), ++catCount,
                                                    totalCats);
                        progress.UpdateTask(imageCatalog.Count, -1, null, details);

                        //get the first page of results
                        var thisCatUrl = String.Format("http://{0}{1}?sort=priceasc", rules.StoreShortUrl, catUrl);
                        var catUri = new Uri(thisCatUrl);
                        try
                        {
                            thisDoc = thisWeb.Load(catUri.AbsoluteUri);
                        }
                        catch (Exception ex)
                        {
                            if (BoostLog.Instance != null)
                                BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to Load " + catUri.AbsoluteUri, ex);
                            continue;
                        }

                        //get page count
                        maxPages = 0;
                        if (searchSpring)
                        {
                            var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
                            if (totalPages != null)
                                maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
                        }
                        if (maxPages < 1)
                        {
                            //TODO: See if there is a better way to calculate pages.
                            maxPages = itemCount/10;
                        }

                        var page = 1;
                        do
                        {
                            var matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                            if (!string.IsNullOrEmpty(rules.CommentParseKey)) //sometimes the nodes we want are hidden in a comment
                            {
                                //NOTE(review): the key is inserted into the XPath unquoted, so it presumably has to
                                //carry its own quotes to be treated as a string literal -- confirm against the rules
                                var commentNodes = thisDoc.DocumentNode.SelectNodes(string.Format("//comment()[contains(., {0})]", rules.CommentParseKey));
                                if (commentNodes != null)
                                {
                                    foreach (var c in commentNodes)
                                    {
                                        try
                                        {
                                            //parse the comment body as its own document and merge any product nodes found
                                            var comment = new HtmlDocument();
                                            comment.LoadHtml(c.InnerHtml.Replace("<!--", "").Replace("-->", ""));
                                            var partialMatch = comment.DocumentNode.SelectNodes(nodeSelector);
                                            if (partialMatch == null) continue;

                                            if (matching == null)
                                            {
                                                matching = partialMatch;
                                            }
                                            else
                                            {
                                                foreach (var match in partialMatch)
                                                    matching.Add(match);
                                            }
                                        }
                                        catch (Exception ex)
                                        {
                                            if (BoostLog.Instance != null)
                                                BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to parse comment node", ex);
                                        }
                                    }
                                }
                            }
                            if (matching == null) break;

                            var oldCount = imageCatalog.Count;
                            AddImagesToCatalog(matching, imageCatalog, rules, progress);
                            if (imageCatalog.Count == oldCount) break; //no new images found

                            var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                            progress.UpdateTask(imageCatalog.Count, -1, null, details + pageStatus);
                            if (++page > maxPages) break;

                            thisCatUrl = String.Format("http://{0}{1}?sort=priceasc&page={2}", rules.StoreShortUrl, catUrl, page);
                            catUri = new Uri(thisCatUrl);
                            try
                            {
                                thisDoc = thisWeb.Load(catUri.AbsoluteUri);
                            }
                            catch (Exception ex)
                            {
                                //a failed page only ends this category, same as a failed first page;
                                //it used to abort every remaining category
                                if (BoostLog.Instance != null)
                                    BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to Load " + catUri.AbsoluteUri, ex);
                                break;
                            }
                        } while (true);
                    }
                }
                catch (Exception ex)
                {
                    Debug.WriteLine(ex.Message);
                }
            }
        }

        /// <summary>
        /// Add the image url of every node that yields a product id not yet in the catalog,
        /// reporting the new catalog size after each addition.
        /// </summary>
        private static void AddImagesToCatalog(HtmlNodeCollection nodes, Dictionary<string, string> imageCatalog,
                                               SiteRules rules, ExtractorProgress progress)
        {
            foreach (var node in nodes)
            {
                string pid;
                var imageUrl = GetUrlFromMatch(node, out pid, ref rules);
                if (string.IsNullOrEmpty(pid)) continue;
                if (imageCatalog.ContainsKey(pid)) continue; //first image found for a product wins

                imageCatalog.Add(pid, imageUrl);
                progress.UpdateTask(imageCatalog.Count);
            }
        }
 /// <summary>
 /// Creates a tabbed feed extractor. The rules are forwarded to the base constructor;
 /// no further initialization happens here.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 public TabbedFeedExtractor(SiteRules rules)
     : base(rules)
 {
 }
        private Dictionary<string, ParentItem> _parentProducts; //key is product id

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Creates the catalog data handler. All arguments are forwarded to the base constructor
        /// together with <c>DataGroup.Catalog</c>; no further initialization happens here.
        /// </summary>
        /// <param name="rules">Site rules, forwarded to the base constructor</param>
        /// <param name="cart">Owning cart extractor, forwarded to the base constructor</param>
        /// <param name="progress">Progress tracker, forwarded to the base constructor</param>
        public CatalogHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
            : base(rules, cart, progress, DataGroup.Catalog)
        {
        }
 /// <summary>
 /// Creates the inventory data handler. All arguments are forwarded to the base constructor
 /// together with <c>DataGroup.Inventory</c>; no further initialization happens here.
 /// </summary>
 /// <param name="rules">Site rules, forwarded to the base constructor</param>
 /// <param name="cart">Owning cart extractor, forwarded to the base constructor</param>
 /// <param name="progress">Progress tracker, forwarded to the base constructor</param>
 public InventoryHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Inventory)
 {
 }
        /// <summary>
        /// Creates a BigCommerce extractor: validates the API credentials, determines the feed types
        /// only when they are valid, and creates the BigCommerce adaptor in either case.
        /// </summary>
        /// <param name="rules">Site rules, forwarded to the base constructor</param>
        public BigCommerceExtractor(SiteRules rules)
            : base(rules)
        {
            //TODO: Implement ValidateCredentials for all carts and move this check to the base class
            string status;
            HasCredentials = ValidateCredentials(out status);
            if (HasCredentials)
            {
                // determine feed types
                SetFeedTypes();
            }
            else if (BoostLog.Instance != null)
            {
                //previously the else branch was tied to the combined condition, so feed types were
                //still set for invalid credentials whenever no log instance was available
                BoostLog.Instance.WriteEntry(EventLogEntryType.Error, "Invalid BigCommerce API Credentials", status, Alias);
            }

            adaptor = new BigCommerceAdaptor(_apiKey, _userName, Rules.ApiUrl, this);
        }
 /// <summary>
 /// Creates an empty brand lookup bound to the given site rules.
 /// </summary>
 /// <param name="Rules">Site rules stored on this instance</param>
 public Brands(SiteRules Rules)
 {
     this.Rules = Rules;
     d = new Dictionary<string, string>();
 }