Пример #1
0
        // GET: Store
        public ActionResult Index()
        {
            ModelContainer context = new ModelContainer();


            //getting configuration
            ScraperBotModel.Configuration configuration = context.Configurations.FirstOrDefault();

            DateTime outdatedDate = DateTime.Now.AddDays(-configuration.expirationTime);

            List <StoreTable> stores = context.Stores.Select(x => new StoreTable
            {
                storeName             = x.name,
                baseURL               = x.baseUrl,
                titleSelector         = x.titleSelector,
                originalPriceSelector = x.originalPriceSelector,
                salePriceSelector     = x.salePriceSelector,
                currency              = x.currency,
                badwords              = x.BadWords.Select(y => y.badWord).ToList(),
                productsFound         = x.Product.Count,
                crawled               = 100 - (100 * x.Link.Where(y => DbFunctions.TruncateTime(y.lastVisit) < DbFunctions.TruncateTime(outdatedDate)).Count() / x.Link.Count())
            }).ToList();

            return(View(stores));
        }
Пример #2
0
        private async void GetProducts(int idStore)
        {
            List <string> log = new List <string>();

            log.Add("=================================");
            log.Add(DateTime.Now.ToString());
            //connecting to DB
            ModelContainer context = new ModelContainer();

            //getting configuration
            ScraperBotModel.Configuration configuration = context.Configurations.FirstOrDefault();


            //Getting last link from only expired links list
            DateTime daysBefore = DateTime.Now.AddDays(-configuration.expirationTime);
            Link     link       = context.Links.Where(x => x.lastVisit < daysBefore && x.idStore == idStore)
                                  .OrderByDescending(x => x.addDate)
                                  .FirstOrDefault();

            if (link != null)
            {
                //Getting store infos based on the link
                Store store = context.Stores.Where(x => x.idStore == idStore)
                              .FirstOrDefault();

                log.Add("looking for a product at: " + link.link);


                //getting the document
                var document = await BrowsingContext.New(AngleSharp.Configuration.Default.WithDefaultLoader())
                               .OpenAsync(link.link);

                //getting title
                var titleSelector = document.QuerySelector(store.titleSelector);

                //List<string> productsType = new List<string> {
                //    "windows",
                //    "office",
                //    "microsoft",
                //    "word",
                //    "powerpoint",
                //    "excel",
                //    "outlook",
                //    "project",
                //    "visio",
                //    "SQL",
                //    "visual",
                //    "studio",
                //    "server",
                //    "access",
                //    "kaspersky",
                //    "mcafee",
                //    "antivirus",
                //    "parallels",
                //    "trend",
                //    "software",
                //    "license",
                //    "download",
                //    "norton"
                //};
                //bool isSoftware = false;

                //foreach (string type in productsType)
                //{
                //    if (titleSelector != null)
                //    {
                //        if (titleSelector.TextContent.Contains(type))
                //        {
                //            isSoftware = true;
                //        }
                //    }
                //}
                //get prices only if title has been found
                if (titleSelector != null)
                {
                    Product product = new Product
                    {
                        date    = DateTime.Now,
                        idStore = idStore,
                    };

                    //Feeding title
                    product.title = titleSelector.TextContent;

                    Regex regex = new Regex(configuration.priceRegex);

                    //Feeding Original price
                    var originalPriceSelector = document.QuerySelectorAll(store.originalPriceSelector)
                                                .Where(x => regex.Match(x.TextContent)
                                                       .Success)
                                                .Select(x => regex.Match(x.TextContent)
                                                        .Groups[1].Value)
                                                .Min();

                    if (originalPriceSelector != null)
                    {
                        product.originalPrice = Parse(originalPriceSelector);
                    }

                    //Feeding Sale price
                    var salePriceSelector = document.QuerySelectorAll(store.salePriceSelector)
                                            .Where(x => regex.Match(x.TextContent)
                                                   .Success)
                                            .Select(x => regex.Match(x.TextContent)
                                                    .Groups[1].Value)
                                            .Min();

                    if (salePriceSelector != null)
                    {
                        product.salePrice = Parse(salePriceSelector) ?? 0;
                    }
                    DateTime date = DateTime.Now;
                    if (context.Products.Where(x => x.title == product.title && DbFunctions.TruncateTime(x.date) == DbFunctions.TruncateTime(date) && x.idStore == idStore).Count() == 0)
                    {
                        log.Add("Product has been found: " + product.title + " | Price: " + ((product.salePrice == 0 || product.salePrice == null) ? product.originalPrice : product.salePrice));
                        //adding this product to the database
                        context.Products.Add(product);
                    }
                }



                //getting all links from this page
                List <string> allLinks      = context.Links.Select(x => x.link).ToList();
                Regex         regexLink     = new Regex(@"^(http:\/\/|https:\/\/)[w]{0,3}\.?" + store.baseUrl);
                List <Link>   AllLinksFound = new List <Link>();

                List <string> badWords = context.BadWords.Where(x => x.idStore == store.idStore).Select(x => x.badWord).ToList();

                Regex regexBadWords = new Regex(string.Join("|", badWords));

                List <string> linksFound = document.QuerySelectorAll("a")
                                           .Where(x => (!string.IsNullOrEmpty(x.GetAttribute("href")) &&
                                                        regexLink.Match(x.GetAttribute("href")).Success &&
                                                        x.GetAttribute("href").Contains(store.baseUrl) &&
                                                        !regexBadWords.Match(x.GetAttribute("href")).Success) ||
                                                  (!string.IsNullOrEmpty(x.GetAttribute("href")) &&
                                                   !x.GetAttribute("href").Contains("http") &&
                                                   !x.GetAttribute("href").Contains("//"))
                                                  )
                                           .GroupBy(x => x.GetAttribute("href"))
                                           .Select(x => x.Key.ToString())
                                           .ToList();

                foreach (string linkFound in linksFound)
                {
                    string linkFormated = (!linkFound.Contains("http")) ? "https://" + store.baseUrl + linkFound : linkFound;

                    if (allLinks.Where(x => x == linkFormated) == null || allLinks.Where(x => x == linkFormated).Count() == 0)
                    {
                        Link newLink = new Link
                        {
                            addDate   = DateTime.Now,
                            idStore   = store.idStore,
                            lastVisit = DateTime.Now.AddDays(-5),
                            status    = true,
                            link      = linkFormated
                        };
                        AllLinksFound.Add(newLink);
                    }
                }
                if (AllLinksFound.Count > 0)
                {
                    //Adding all links that was found to the DB
                    context.Links.AddRange(AllLinksFound);
                }
                log.Add("New links found: " + AllLinksFound.Count);

                link.lastVisit = DateTime.Now;

                context.SaveChanges();

                Thread.Sleep(configuration.serviceInterval * 1000);
            }
            else
            {
                Thread.Sleep(60000);
            }
            log.Add(" ");
            Console.WriteLine(string.Join("\r\n", log));
            GetProducts(idStore);
        }