/// <summary>
/// Common construction for all data handlers: stores the shared collaborators,
/// seeds the export timestamp, computes the migration-slave flag, and resets state.
/// </summary>
/// <param name="rules">Site rules this handler operates under.</param>
/// <param name="cart">Owning cart extractor.</param>
/// <param name="progress">Shared progress tracker.</param>
/// <param name="group">Data group this handler is responsible for.</param>
public DataHandlerBase(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
 {
     _rules = rules;
     _cart = cart;
     _progress = progress;
     _group = group;

     _exportDate = DateTime.Now;
     _exportDateInitialized = false;

     // A migration slave has migration rules that are enabled but is not the master.
     var migration = rules.MigrationRules;
     _migrationSlave = migration != null
                       && migration.Enabled
                       && !migration.IsMigrationMaster;

     Reset();
 }
 /// <summary>
 /// Handler for the Sales data group; all initialization is delegated to DataHandlerBase.
 /// </summary>
 public SalesHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Sales)
 {
 }
        private Dictionary<string, ParentItem> _parentProducts; //key is product id

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Handler for the product Catalog data group; all initialization is delegated to DataHandlerBase.
        /// </summary>
        public CatalogHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
            : base(rules, cart, progress, DataGroup.Catalog)
        {
        }
// Ejemplo n.º 4
// 0
        /// <summary>
        /// Scrapes product image urls from the site's Categories pages and adds them to
        /// imageCatalog (keyed by product id). Depending on the detected DisplayType the
        /// crawl is either a single flat paged listing or one paged listing per category url.
        /// </summary>
        /// <param name="rules">Site rules supplying the product node selector, store url, and scrape options.</param>
        /// <param name="catUrls">Category urls to crawl when the site uses a tree structure.</param>
        /// <param name="imageCatalog">Map of product id to image url; new entries are added in place.</param>
        /// <param name="itemCount">Total product count, used to estimate the maximum page count.</param>
        /// <param name="progress">Progress tracker updated as images are found.</param>
        internal static void LoadProductImagesForCategory(SiteRules rules, List<string> catUrls,
		                                                  ref Dictionary<string, string> imageCatalog,
		                                                  int itemCount, ExtractorProgress progress
			)
        {
            var nodeSelector = rules.ProductNodeSelector; //Xpath to find product nodes
            if (string.IsNullOrEmpty(nodeSelector))
                nodeSelector = "//div[@class='ProductImage QuickView']//img[contains(@src,'products')]";

            //determine whether the categories have a tree or flat structure
            var displayType = rules.ForceCategoryTreeScrape ? CategoryDisplayType.Tree : CategoryDisplayType.Flat;
            var maxPages = 1;

            //load the first flat listing page; it is probed below to pick the display type
            var uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page=1", rules.StoreShortUrl));
            var thisWeb = new HtmlWeb();
            var thisDoc = thisWeb.Load(uri.AbsoluteUri);
            var searchSpring = false;
            //while (!rules.ForceCategoryTreeScrape) //NOTE: could skip while loop in this case, but left out for testing
            while (true) //single-pass loop to set display type
            {
                //check for use of searchspring --Ajax response is not seen by HtmlAgilityPack
                var matching = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-results_container']");
                //var matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
                if (matching != null)
                {
                    searchSpring = true;
                    matching = thisDoc.DocumentNode.SelectNodes("//div[@id='searchspring-main']");
                    if (matching == null)
                        break;
                    //if the main container does not report "no results", the flat listing works --keep Flat
                    if (!matching[0].Attributes["class"].Value.Contains("searchspring-no_results"))
                        //var noResults = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-no_results']");
                        //if (noResults == null)
                        break;

                    //searchspring returned no results on the flat page --fall back to a per-category crawl
                    displayType = CategoryDisplayType.Tree;
                    break;
                }

                matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                if (matching != null) break;
                //primary selector found nothing --try a looser selector before giving up on Flat
                var testSelector = "//div[starts-with(@class,'ProductImage')]";
                    //"//div[@class='ProductImage']//img[@src]";
                matching = thisDoc.DocumentNode.SelectNodes(testSelector);
                if (matching != null)
                {
                    nodeSelector = testSelector;
                    break;
                }

                //no product nodes found on the flat listing page --assume a category tree
                displayType = CategoryDisplayType.Tree;
                break;
            }

            var pageStatusFormat = progress.ExtraStatus + " --page {0} of {1} (max)";
            if (displayType == CategoryDisplayType.Flat)
            {
                // we have a catalog setup with a cover page, so we need to resolve the Catalog first.

                //get page count
                if (searchSpring)
                {
                    var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
                    if (totalPages != null)
                        maxPages = Input.SafeIntConvert(totalPages[0].InnerText);
                }
                else
                {
                    //estimate max pages from the number of product nodes on the first page
                    //var nodes = thisDoc.DocumentNode.SelectNodes("//div[@id='CategoryContent']//ul/li");
                    var nodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                    var imagesPerPage = nodes == null ? 1 : nodes.Count;
                    maxPages = (int) Math.Ceiling((decimal) itemCount/(decimal) imagesPerPage);
                }

                //do // loop through pages incrementally until we don't have anything to match
                for (var page = 1; page <= maxPages; page++)
                {
                    try
                    {
                        //we already got the first page above, so process it and then grab the next
                        var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                        progress.UpdateTask(imageCatalog.Count, -1, null, pageStatus);

                        var imageNodes = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                        if (imageNodes == null)
                            break; // not all products usually have images, so this could end before we run out of pages

                        var imageUrl = "";
                        var pid = "";
                        foreach (var node in imageNodes)
                        {
                            imageUrl = GetUrlFromMatch(node, out pid, ref rules);
                            if (string.IsNullOrEmpty(pid)) continue;

                            //first image found for a product wins; later pages never overwrite
                            if (!imageCatalog.ContainsKey(pid))
                            {
                                imageCatalog.Add(pid, imageUrl);
                                progress.UpdateTask(imageCatalog.Count);
                            }
                        }

                        // we have a catalog setup with a cover page, so we need to resolve the Catalog first.
                        uri = new Uri(String.Format("http://{0}/categories?sort=priceasc&page={1}", rules.StoreShortUrl, page + 1));
                        thisDoc = thisWeb.Load(uri.AbsoluteUri);
                    }
                    catch (Exception ex)
                    {
                        //best-effort: a failed page is skipped and the loop moves on
                        Debug.WriteLine(ex.Message);
                    }
                }
            }
            else //Tree structure
            {
                try
                {
                    var catCount = 0;
                    var totalCats = catUrls.Count;
                    foreach (var catUrl in catUrls)
                    {
                        var details = string.Format("Pulling images for {0} ({1} of {2} categories)", catUrl.Replace("/", ""), ++catCount,
                                                    totalCats);
                        progress.UpdateTask(imageCatalog.Count, -1, null, details);

                        //get the first page of results
                        var thisCatUrl = String.Format("http://{0}{1}?sort=priceasc", rules.StoreShortUrl, catUrl);
                        var catUri = new Uri(thisCatUrl);
                        try
                        {
                            thisDoc = thisWeb.Load(catUri.AbsoluteUri);
                        }
                        catch (Exception ex)
                        {
                            //an unreachable category is logged and skipped; remaining categories still run
                            if (BoostLog.Instance != null)
                                BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to Load " + catUri.AbsoluteUri, ex);
                            continue;
                        }
                        //get page count
                        maxPages = 0;
                        if (searchSpring)
                        {
                            var totalPages = thisDoc.DocumentNode.SelectNodes("//div[@class='searchspring-total_pages']");
                            if (totalPages != null)
                                maxPages = Input.SafeIntConvert(totalPages[0].InnerText);

                        }
                        if (maxPages < 1)
                        {
                            //TODO: See if there is a better way to calculate pages.
                            maxPages = itemCount/10;
                        }
                        var page = 1;
                        do
                        {
                            var matching = thisDoc.DocumentNode.SelectNodes(nodeSelector);
                            //var matching = thisDoc.DocumentNode.SelectNodes("//div[starts-with(@class,'ProductImage')]//img[@src]");
                            //if (matching == null)
                            //  matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage[QuickView]?')]//img[@src]");
                            //if (matching == null)
                            //  matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage']//img[@src]");
                            //if (matching == null)
                            //  matching = thisDoc.DocumentNode.SelectNodes("//div[@class='ProductImage QuickView']//img[@src]");
                            if (!string.IsNullOrEmpty(rules.CommentParseKey)) //sometimes the nodes we want are hidden in a comment
                            {
                                var commentNodes = thisDoc.DocumentNode.SelectNodes(string.Format("//comment()[contains(., {0})]", rules.CommentParseKey));
                                if (commentNodes != null)
                                {
                                    foreach (var c in commentNodes)
                                    {
                                        try
                                        {
                                            //re-parse the comment interior as html so the node selector can run on it
                                            var comment = new HtmlDocument();
                                            comment.LoadHtml(c.InnerHtml.Replace("<!--", "").Replace("-->", ""));
                                            var partialMatch = comment.DocumentNode.SelectNodes(nodeSelector);
                                            if (partialMatch != null)
                                                if (matching == null) matching = partialMatch;
                                                else
                                                    foreach (var match in partialMatch)
                                                        matching.Add(match);
                                        }
                                        catch (Exception ex)
                                        {
                                            if (BoostLog.Instance != null)
                                                BoostLog.Instance.WriteEntry(EventLogEntryType.Information, "Crawler: Unable to parse comment node", ex);
                                        }
                                    }
                                }
                            }
                            if (matching == null) break;

                            var matchCount = matching.Count;
                            var oldCount = imageCatalog.Count;
                            var imageUrl = "";
                            var pid = "";
                            foreach (var node in matching)
                            {
                                imageUrl = GetUrlFromMatch(node, out pid, ref rules);
                                if (string.IsNullOrEmpty(pid)) continue;

                                if (!imageCatalog.ContainsKey(pid))
                                {
                                    imageCatalog.Add(pid, imageUrl);
                                    progress.UpdateTask(imageCatalog.Count);
                                }
                            }
                            if (imageCatalog.Count == oldCount) break; //no new images found

                            var pageStatus = string.Format(pageStatusFormat, page, maxPages);
                            progress.UpdateTask(imageCatalog.Count, -1, null, details + pageStatus);
                            if (++page > maxPages) break;

                            thisCatUrl = String.Format("http://{0}{1}?sort=priceasc&page={2}", rules.StoreShortUrl, catUrl, page);
                            catUri = new Uri(thisCatUrl);
                            thisDoc = thisWeb.Load(catUri.AbsoluteUri);

                        } while (true);
                    }
                }
                catch (Exception ex)
                {
                    //NOTE(review): a failure here aborts ALL remaining categories --TODO confirm intended
                    Debug.WriteLine(ex.Message);
                }
            }
        }
 /// <summary>
 /// Handler for an attribute data group (category/brand/department names);
 /// the caller chooses the group, and all initialization is delegated to DataHandlerBase.
 /// </summary>
 public AttributeHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress, DataGroup group)
     : base(rules, cart, progress, group)
 {
 }
        /// <summary>
        /// Shared base construction for cart extractors: validates the rules, resolves the
        /// client data paths, and creates the per-group handlers and lookup tables used
        /// during extraction.
        /// </summary>
        /// <param name="rules">Site rules for this cart; must not be null.</param>
        protected CartExtractor(SiteRules rules)
        {
            if (rules == null) throw new NoNullAllowedException("CartExtractor Rules cannot be null");
            Rules = rules;
            Alias = Rules.Alias;
            //NOTE(review): the ref overload may rewrite Alias before it is reused below --confirm against DataPath
            DataReadPath = IO.DataPath.Instance.ClientDataPath(ref Alias, true);
            DataWritePath = IO.DataPath.Instance.ClientDataPath(Alias, true);
            Progress = new ExtractorProgress();
            IsExtractorQueued = false;
            IsExtracting = false;

            //service-related members are excluded from the test build
            #if !CART_EXTRACTOR_TEST_SITE
            BoostService = RestAccess.Instance;
            FeaturedCrossSells = new FeaturedRecommendations();
            FeaturedUpSells = new FeaturedRecommendations();
            #endif
            ExclusionStats = new Dictionary<string, int>();
            //one handler per data group, all sharing this extractor's Rules and Progress
            Catalog = new CatalogHandler(Rules, this, Progress);
            Inventory = new InventoryHandler(Rules, this, Progress);
            Sales = new SalesHandler(Rules, this, Progress);
            Customers = new CustomerHandler(Rules, this, Progress);
            CategoryNames = new AttributeHandler(Rules, this, Progress, DataGroup.CategoryNames);
            BrandNames = new AttributeHandler(Rules, this, Progress, DataGroup.ManufacturerNames);
            DepartmentNames = new AttributeHandler(Rules, this, Progress, DataGroup.DepartmentNames);

            //TODO: Move all below to the CatalogHandler class or deprecate
            Exclusions = new List<ExclusionRecord>();
            Replacements = new List<ReplacementRecord>();
            ParentProducts = new Dictionary<string, ParentItem>();
            AltPrices = new Dictionary<string, List<string>>();
            AltPageLinks = new Dictionary<string, List<string>>();
            AltImageLinks = new Dictionary<string, List<string>>();
            AltTitles = new Dictionary<string, List<string>>();
            Departments = new Dictionary<string, string>();
            ExclusionCauses = new Dictionary<string, string>();
        }
 /// <summary>
 /// Handler for the Inventory data group; all initialization is delegated to DataHandlerBase.
 /// </summary>
 public InventoryHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Inventory)
 {
 }
 /// <summary>
 /// Stores the progress tracker to use during migration. ExtractorProgress is a
 /// class, so this keeps a shared reference to the caller's instance, not a copy.
 /// </summary>
 /// <param name="mp">Progress tracker shared with the migration master.</param>
 public void SetMigrationProgress(ExtractorProgress mp) => _migrationProgress = mp;
 /// <summary>
 /// Starts a mapping pass: records the progress tracker and, when migration is
 /// enabled, replaces the sub-map used to collect migration mappings.
 /// </summary>
 /// <param name="progress">Progress tracker for this mapping pass.</param>
 public void BeginMapping(ExtractorProgress progress)
 {
     Progress = progress;
     if (!Enabled)
         return;

     MigrationSubMap = new Dictionary<string, string>();
 }
        /// <summary>
        /// Recursive method to read from the feed client.
        /// Method will retry on timeout if ApiMaxTries is greater than 1, waiting
        /// _config.RetryDelay between attempts; tryCount tracks the current attempt.
        /// </summary>
        /// <param name="feedUrl">Url of the feed to open.</param>
        /// <param name="progress">Progress tracker updated with retry status.</param>
        /// <param name="tryCount">Current attempt number (1-based).</param>
        /// <returns>The open response stream (null only if all retries are exhausted without throwing).</returns>
        public Stream TryOpenRead(string feedUrl, ref ExtractorProgress progress, int tryCount = 1)
        {
            Stream resultStream = null;
            var details = "";
            try
            {
                InitServicePointManager();
                resultStream = OpenRead(feedUrl);
            }
            catch (TimeoutException tex)
            {
                if (resultStream != null) resultStream.Close();
                resultStream = null;
                details = tex.Message;
                //preserve the original exception as InnerException so callers can diagnose the failure
                if (tryCount >= _config.MaxTries)
                    throw new Exception(details, tex);
            }
            catch (WebException wex)
            {
                if (resultStream != null) resultStream.Close();
                resultStream = null;
                details = wex.Message;
                //only timeouts are retried; any other web failure (or exhausted retries) is fatal
                if (wex.Status != WebExceptionStatus.Timeout || tryCount >= _config.MaxTries)
                    throw new Exception(details, wex);
            }
            #if DEBUG
            catch (Exception ex)
            {
                if (resultStream != null) resultStream.Close();
                resultStream = null;
                details = string.Format("Exception in TryOpenRead (trycount = {0}): {1}", tryCount, ex.Message);
                throw; //was "throw ex;" which reset the stack trace
            }
            finally
            {
                //only log when something actually failed; previously this wrote an
                //empty Error entry on every successful call in DEBUG builds
                if (!string.IsNullOrEmpty(details) && BoostLog.Instance != null)
                    BoostLog.Instance.WriteEntry(EventLogEntryType.Error, details, feedUrl);
            }
            #endif

            //a swallowed timeout leaves resultStream null --retry until MaxTries is reached
            if (resultStream == null && tryCount < _config.MaxTries)
            {
                tryCount++;
                var msg = string.Format("Timeout. Retry {0} of {1}", tryCount, _config.MaxTries);
                progress.UpdateTable(-1, -1, msg);
            #if DEBUG
                if (BoostLog.Instance != null)
                    BoostLog.Instance.WriteEntry(EventLogEntryType.Warning, msg, feedUrl);
            #endif
                Thread.Sleep(_config.RetryDelay);
                return TryOpenRead(feedUrl, ref progress, tryCount);
            }
            return resultStream;
        }
 /// <summary>
 /// Handler for the Customers data group; all initialization is delegated to DataHandlerBase.
 /// </summary>
 public CustomerHandler(SiteRules rules, CartExtractor cart, ExtractorProgress progress)
     : base(rules, cart, progress, DataGroup.Customers)
 {
 }