/// <summary>
        /// Checks if the attribute ("грн" for instance) is contained only once in the node
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        private static bool TryGetSingleOccurance(HtmlNode node, ParseDomainParams pdp, out CurrencyRegexes currency, out Regex regex)
        {
            regex    = null;
            currency = null;

            bool found = false;

            foreach (var cur in pdp.CurRegexes)
            {
                foreach (var reg in cur.Regexes)
                {
                    var matches = reg.Matches(node.InnerText).Count;
                    if (matches > 1)
                    {
                        return(false);
                    }
                    if (matches == 1)
                    {
                        if (found)
                        {
                            return(false);
                        }

                        regex    = reg;
                        currency = cur;

                        found = true;
                    }
                }
            }

            return(found);
        }
        /// <summary>
        /// Checks if the attribute ("грн" for instance) is contained once or more in the node
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        private static int TryGetAnyOccurance(HtmlNode node, ParseDomainParams pdp, out CurrencyRegexes currency, out Regex regex)
        {
            regex    = null;
            currency = null;

            int foundCount = 0;

            foreach (var cur in pdp.CurRegexes)
            {
                foreach (var reg in cur.Regexes)
                {
                    var matches = reg.Matches(node.InnerText).Count;
                    foundCount += matches;
                    if (matches > 0)
                    {
                        regex    = reg;
                        currency = cur;
                    }
                    if (foundCount > 1)
                    {
                        return(foundCount);
                    }
                }
            }

            return(foundCount);
        }
Esempio n. 3
0
        public ShopWrapper(ParseDomainParams pdp)
        {
            this.pdp = pdp;

            //this.shortDomain = GetDomain.GetDomainFromUrl(domainName);
            links.Add(pdp.Domain);
        }
        /// <summary>
        /// Gets all products from page using SearchPriceKind.spkOuter
        /// </summary>
        /// <param name="node">node</param>
        /// <param name="pdp">parse params</param>
        /// <returns></returns>
        private static IEnumerable <Product> GetProductsOuter(this HtmlNode node, ParseDomainParams pdp)
        {
            HashSet <Product> res = new HashSet <Product>();

            if (node.Name == "script")
            {
                return(res);
            }
            int ocCount = GetNodePriceCount(node, pdp);

            if (ocCount > 1)
            {
                if (node.ChildNodes.Count > 1)
                {
                    List <Task> tasks   = new List <Task>();
                    object      lockObj = new object();
                    foreach (var child in node.ChildNodes)
                    {
                        Task t = new Task(() =>
                        {
                            var childRes = GetProductsOuter(child, pdp);

                            foreach (var p in childRes)
                            {
                                res.LAdd(p, lockObj);
                            }
                        });
                        t.Start();
                        tasks.Add(t);
                    }
                    Task.WaitAll(tasks.ToArray());
                }
                else
                {
                    foreach (var p in GetProductsOuter(node.FirstChild, pdp))
                    {
                        res.Add(p);
                    }
                }
            }
            else if (ocCount == 1)
            {
                //Creating single product:
                var urls = node.GetImageURLs();

                if (TryGetSingleOccurance(node, pdp, out CurrencyRegexes currency, out Regex regex))
                {
                    var product = BuildProduct(node, pdp, regex, urls, currency);
                    if (product != null)
                    {
                        res.Add(product);
                    }
                }
            }

            return(res);
        }
        public void GetProducts()
        {
            var pdp    = new ParseDomainParams("http://via.placeholder.com/", new HashSet <string>(), new HashSet <string>(), new string[] { "`" }, "", AgilityPackClasses.DescriptionGetKind.dgkLongest, AgilityPackClasses.SearchPriceKind.spkInner);
            var result = controller.GetProducts(pdp);
            var js     = new JavaScriptSerializer();

            Assert.AreEqual(js.Serialize(result.First()), js.Serialize(context.Products.First()));
            Assert.AreEqual(result.Count(), context.Products.Count());
        }
Esempio n. 6
0
        public IEnumerable <Product> GetProducts([FromBody] ParseDomainParams pdp)
        {
            if (pdp != null)
            {
                System.Diagnostics.Debug.WriteLine("Parse enums:" + pdp.DescriptionGetKind + " " + pdp.SearchPriceKind);
            }
            //var pdp1 = new ParseDomainParams("https://www.olx.ua", new HashSet<string>(), new HashSet<string>(), new string[] { "" }, "", 0, 0);
            if (pdp != null && Uri.TryCreate(pdp.Domain, UriKind.Absolute, out Uri uri))
            {
                var shopWrapper = new ShopWrapper(pdp);

                var products = shopWrapper.WrapShop();

                foreach (var product in products)
                {
                    //adding new images to database:

                    foreach (var image in product.Images)
                    {
                        db.Images.Add(image);
                    }

                    //TODO: Add try catch exception
                    var p = db.Products.Any() ? db.Products.Where(x => x.Description == product.Description && x.Domain == product.Domain && x.Currency.Code == product.Currency.Code).SingleOrDefault() : null;

                    if (p != null)
                    {
                        db.LoadConnections(p);

                        List <Image> oldImages = p.Images.ToList();

                        oldImages.ForEach(img => p.Images.Remove(img));

                        foreach (var img in product.Images)
                        {
                            p.Images.Add(img);
                        }

                        p.DeltaPrice = p.Price - product.Price;
                        p.Price      = product.Price;
                        p.Currency   = db.FindCurrency(p.Currency.Code);
                        db.Products.Attach(p);
                    }
                    else
                    {
                        product.Currency = db.FindCurrency(product.Currency.Code);
                        db.Products.Add(product);
                    }
                }
                db.SaveChanges();
            }
            var list = db.Products.ToList();

            list.ForEach(x => db.LoadCurrency(x));
            return(list);
        }
 /// <summary>
 /// Gets all products from page
 /// </summary>
 /// <param name="doc">document</param>
 /// <param name="pdp">Parse domain params</param>
 /// <returns></returns>
 public static IEnumerable <Product> GetProducts(this HtmlDocument doc, ParseDomainParams pdp)
 {
     if (pdp.SearchPriceKind == SearchPriceKind.spkInner)
     {
         return(doc.GetProductsInner(pdp));
     }
     else
     {
         return(doc.DocumentNode.GetProductsOuter(pdp));
     }
 }
        /// <summary>
        /// Checks number of occurences of the attribute ("грн" for instance) in the node
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        private static int GetNodePriceCount(HtmlNode node, ParseDomainParams pdp)
        {
            int res = 0;

            foreach (var cur in pdp.CurRegexes)
            {
                foreach (var reg in cur.Regexes)
                {
                    res += reg.Matches(node.InnerText).Count;
                }
            }

            return(res);
        }
        /// <summary>
        /// Finds price value
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        private static string FindPrice(HtmlNode node, Regex curRegex, ParseDomainParams pdp)
        {
            var v = curRegex.Match(node.InnerText).Value;

            if (v != "")
            {
                var s = pdp.PriceRegex.Match(v).Value;
                s = Currencies.CutPrice(s, pdp.DecimalSeparator);
                if (pdp.DecimalSeparator != "")
                {
                    s = s.Replace(pdp.DecimalSeparator, Thread.CurrentThread.CurrentCulture.NumberFormat.CurrencyDecimalSeparator);
                }
                return(s);
            }

            return("");
        }
        public void GetProductsHashSet_OuterFull()
        {
            var pdp = new ParseDomainParams("http://via.placeholder.com/", new HashSet <string>()
            {
                " deleteWord", "\t"
            }, new HashSet <string>()
            {
                "неликвидный"
            }, currencySeparators, decimalSeparator, AgilityPackClasses.DescriptionGetKind.dgkFull, AgilityPackClasses.SearchPriceKind.spkOuter);
            var products = doc.GetProducts(pdp);

            Debug.WriteLine(js.Serialize(products));
            Debug.WriteLine(js.Serialize(prTest));
            Assert.IsNotNull(products);
            List <Product> pList = products.ToList();

            pList.Sort((x, y) =>
                       x.Price.CompareTo(y.Price));
            Assert.AreEqual(js.Serialize(pList), js.Serialize(prTest));
        }
        /// <summary>
        /// Gets the longest value of InnerText of object which doesn't contain children with non-empty innerText
        /// </summary>
        /// <param name="node">node</param>
        /// <param name="pdp">node</param>
        /// <param name="curRegex">Currency Regex</param>
        /// <returns>The longest inner text</returns>
        private static string GetLongestInnerText(this HtmlNode node, ParseDomainParams pdp, Regex curRegex)
        {
            var sLongest = "";

            foreach (var child in node.ChildNodes)
            {
                var s = child.GetLongestInnerText(pdp, curRegex);
                if (s.Length > sLongest.Length)
                {
                    sLongest = s;
                }
            }

            if (sLongest.Length == 0)
            {
                var res        = node.InnerText;
                int priceCount = curRegex.Matches(res).Count;

                if (priceCount > 1)
                {
                    return("");
                }

                if (priceCount == 1)
                {
                    var replace = curRegex.Match(res).Value;
                    if (replace == "")
                    {
                        return("");
                    }
                    res = res.Replace(replace, "");
                }

                return(res);
            }
            return(sLongest);
        }
        /// <summary>
        /// Get product from node data
        /// </summary>
        /// <param name="node">node</param>
        /// <param name="pdp">Parse params</param>
        /// <param name="curRegex">currency Regex</param>
        /// <param name="urls">urls of images</param>
        /// <returns></returns>
        private static Product BuildProduct(HtmlNode node, ParseDomainParams pdp, Regex curRegex, IEnumerable <string> urls, Currency currency)
        {
            //Parsing price value from node
            string price = FindPrice(node, curRegex, pdp);

            HashSet <Image> images = new HashSet <Image>();

            foreach (var u in urls)
            {
                var img = StringUtils.ImageRegexes.BuildImage(u, pdp.Domain);
                if (img != null)
                {
                    images.Add(img);
                }
            }

            string description = node.GetProductDescription(curRegex, pdp);

            if (description != "")
            {
                return(new Product(pdp.Domain, description, Decimal.Parse(price, System.Globalization.NumberStyles.Currency), images, currency));
            }
            return(null);
        }
        /// <summary>
        /// Gets all products from page using SearchPriceKind.spkInner
        /// </summary>
        /// <param name="node">node</param>
        /// <param name="pdp">parse params</param>
        /// <returns></returns>
        private static IEnumerable <Product> GetProductsInner(this HtmlDocument doc, ParseDomainParams pdp)
        {
            HashSet <Product> products = new HashSet <Product>();

            //Seaches nodes which contain attribute("грн" for instance) only once
            HashSet <NodeCurrencyInfo> currencyNodeInfos = GetInnerNodesContainingSinglePrice(doc.DocumentNode, pdp);

            foreach (var cni in currencyNodeInfos)
            {
                var node = cni.Node;
                IEnumerable <string> urls;

                //Moving each node from child to parent, until al least one picture is found.
                //If node contains more than one price - cycle is aborted.
                do
                {
                    urls = node.GetImageURLs();

                    //if in one segment there is more than one price or no prices - node doesn't contain product
                    if (GetNodePriceCount(node, pdp) != 1)
                    {
                        break;
                    }

                    if (urls.Count() > 0)
                    {
                        products.Add(BuildProduct(node, pdp, cni.CurrencyRegex, urls, cni.Currency));

                        break;
                    }

                    node = node.ParentNode;
                } while (node != null);
            }
            return(products);
        }
        private static string GetProductDescription(this HtmlNode node, Regex curRegex, ParseDomainParams pdp)
        {
            string description;

            if (pdp.DescriptionGetKind == DescriptionGetKind.dgkLongest)
            {
                description = node.GetLongestInnerText(pdp, curRegex);
            }
            else //if(pdp.DescriptionGetKind==DescriptionGetKind.dgkFull)
            {
                description = node.InnerText;

                string match = curRegex.Match(description).Value;

                description = description.Replace(match, "");
            }

            foreach (var s in pdp.SkipDecriptionWhenFound)
            {
                if (description.Contains(s))
                {
                    return("");
                }
            }

            foreach (var s in pdp.ReplaceInDecription)
            {
                description = description.Replace(s, "");
            }

            return(description);
        }
        /// <summary>
        /// Get all nodes containing price attribute ("грн" for instance) only once.
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        private static HashSet <NodeCurrencyInfo> GetInnerNodesContainingSinglePrice(HtmlNode node, ParseDomainParams pdp)
        {
            HashSet <NodeCurrencyInfo> res = new HashSet <NodeCurrencyInfo>();

            if (node.Name == "script")
            {
                return(res);
            }
            List <Task> tasks        = new List <Task>();
            object      lockObj      = new object();
            int         matchesCount = TryGetAnyOccurance(node, pdp, out CurrencyRegexes currency, out Regex regex);

            if (matchesCount > 0)
            {
                foreach (var child in node.ChildNodes)
                {
                    Task t = new Task(() =>
                    {
                        HashSet <NodeCurrencyInfo> currencyInfos = GetInnerNodesContainingSinglePrice(child, pdp);
                        foreach (var c in currencyInfos)
                        {
                            res.LAdd(c, lockObj);
                        }
                    });
                    t.Start();
                    tasks.Add(t);
                }

                Task.WaitAll(tasks.ToArray());

                if (res.Count() == 0 && matchesCount == 1)
                {
                    res.Add(new NodeCurrencyInfo(node, currency, regex));
                }
            }
            return(res);
        }