Пример #1
0
        // Serializes appends to the CSV file. Main invokes WriteOutput from inside
        // Parallel.ForEach, and concurrent File.AppendAllText calls on the same path
        // can fail with a sharing violation, which previously dropped the rows.
        private static readonly object CsvFileGate = new object();

        /// <summary>
        /// Appends one comma-separated line per review of <paramref name="v"/> to the
        /// output CSV file. Restaurant-level columns are repeated on every line.
        /// </summary>
        /// <param name="v">Restaurant record whose <c>Reviewes</c> collection is written out.</param>
        /// <remarks>
        /// Column order (no header line is written):
        /// Destination Country, City, Restaurant list page, Restaurant Name, Address, cuisine,
        /// No of reviews, Url, Excellent, Very Good, Average, Poor, Terrible, Food, Value,
        /// Service, Atmosphere, Topic of comments, Name of Reviewer, Review Date,
        /// Reviewer location, Reviewer Country, level, profile.
        /// Write failures are reported on the console and are not rethrown.
        /// </remarks>
        static void WriteOutput(Data v)
        {
            // Build the file content: one line per review.
            var csv = new StringBuilder();
            foreach (var c in v.Reviewes)
            {
                // NOTE(review): ToFormatString is defined elsewhere; presumably it makes the
                // value safe for a comma-separated line - confirm, because no quoting is done here.
                var fields = new object[]
                {
                    v.Country.ToFormatString(),
                    v.City.ToFormatString(),
                    v.ListUrl.ToFormatString(),
                    v.Name.ToFormatString(),
                    v.Address.ToFormatString(),
                    v.cuisine.ToFormatString(),
                    v.Counts.ToFormatString(),
                    c.Url.ToFormatString(),
                    v.Excellent.ToFormatString(),
                    v.VeryGood.ToFormatString(),
                    v.Average.ToFormatString(),
                    v.Poor.ToFormatString(),
                    v.Terrible.ToFormatString(),
                    v.Food.ToFormatString(),
                    v.Value.ToFormatString(),
                    v.Service.ToFormatString(),
                    v.Atmosphere.ToFormatString(),
                    c.ReviewQuote.ToFormatString(),
                    c.Name.ToFormatString(),
                    c.Date.ToFormatString(),
                    c.City.ToFormatString(),
                    c.Country.ToFormatString(),
                    c.ReviererLevel.ToFormatString(),
                    c.ReviewerProfileUrl.ToFormatString()
                };
                csv.AppendLine(string.Join(",", fields));
            }

            try
            {
                // Only one thread at a time may append to the shared file.
                lock (CsvFileGate)
                {
                    File.AppendAllText("c:\\build80_9_8_2015.csv", csv.ToString());
                }
            }
            catch (Exception e)
            {
                // Best effort: keep scraping, but say why the write failed.
                Console.WriteLine("write error: " + e.Message);
            }
        }
Пример #2
0
        /// <summary>
        /// Entry point. Walks the paged restaurant search listing in parallel; for every
        /// restaurant link found on a listing page it creates a <c>Data</c> record, loads its
        /// reviews via <c>LoadReview</c> and appends them to the CSV via <c>WriteOutput</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Running counter used to assign Data.ID; it is incremented from parallel workers.
            int i = 0;
            var listHotels = new System.Collections.Concurrent.ConcurrentBag<Data>();
            /* Bali url */
            var extUrl = "/RestaurantSearch?ajax=0&geo=294226&Action=PAGE&o=a{0}&etags=9910%2C9911%2C9909%2C9901%2C9899%2C9900";
            // Alternative listing:
            //  var extUrl = "/RestaurantSearch?ajax=0&geo=294265&Action=PAGE&o=a{0}&etags=9909%2C9899%2C9901%2C9900%2C9910%2C9911";

            // Listing offsets 30, 60, ..., 7110.
            // NOTE(review): offset 0 is not part of this sequence, so the first listing page
            // is presumably never requested - confirm whether that is intended.
            var nods = Enumerable.Range(1, 7110).Where(p => p % 30 == 0);

            Parallel.ForEach(nods, o =>
            {
                try
                {
                    var pageUrl = base_url + string.Format(extUrl, o);

                    // Up to 10 attempts, pausing one second after each failure.
                    HtmlDocument doc = null;
                    for (var ic = 0; ic < 10 && doc == null; ic++)
                    {
                        try
                        {
                            doc = GetDoc(pageUrl);
                        }
                        catch (Exception)
                        {
                            Console.WriteLine("error to load " + pageUrl + " " + ic);
                            System.Threading.Thread.Sleep(1000);
                        }
                    }

                    if (doc == null)
                    {
                        return;
                    }

                    // Restaurant titles: <h3> elements carrying any attribute whose value is "title".
                    var nodes = doc.DocumentNode.Descendants().Where(p => p.Name == "h3" && p.Attributes.FirstOrDefault(n => n.Value == "title") != null);
                    foreach (var item in nodes)
                    {
                        var test = item.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element && p.Name == "a").FirstOrDefault();
                        if (test == null)
                        {
                            continue;
                        }

                        var d = new Data();
                        d.Name = test.InnerText;

                        var url = test.Attributes.FirstOrDefault(p => p.Name == "href");
                        if (url != null)
                        {
                            // Atomic increment: "++i" on a captured local is a data race
                            // when several listing pages are processed concurrently.
                            d.ID = System.Threading.Interlocked.Increment(ref i);
                            d.url = url.Value;
                        }
                        d.ListUrl = base_url + d.url;
                        listHotels.Add(d);
                        LoadReview(d);
                        WriteOutput(d);
                        Console.WriteLine(d.ToString());
                    }
                }
                catch (Exception e)
                {
                    // One failing listing page must not stop the others, but the failure is reported.
                    Console.WriteLine(e.Message);
                }
            });

            // NOTE(review): every record was already passed to WriteOutput(Data) above; this call
            // targets an overload not visible here - confirm it does not write the same rows twice.
            WriteOutput(listHotels.ToList());
        }
Пример #3
0
        // Guards d.Reviewes.Add in GetReview: LoadReview calls GetReview for the same Data
        // instance from inside Parallel.ForEach.
        // NOTE(review): assumes Data.Reviewes is not a thread-safe collection - confirm.
        private static readonly object ReviewAppendGate = new object();

        /// <summary>
        /// Extracts restaurant details (cuisine, address, category ratings, review count,
        /// rating histogram) from <paramref name="doc"/> into <paramref name="d"/>, and appends
        /// one <c>Review</c> to <c>d.Reviewes</c> for every review element found on the page.
        /// </summary>
        /// <param name="d">Record that receives the extracted values.</param>
        /// <param name="doc">Parsed restaurant review page.</param>
        /// <param name="url">Address the page was loaded from; not used by this method.</param>
        /// <remarks>
        /// For each review that carries a member-overlay id, an additional page is downloaded
        /// via <c>GetDoc</c> (up to 10 attempts, 10 seconds pause after a failure).
        /// </remarks>
        static void GetReview(Data d, HtmlDocument doc, string url = "")
        {
            var root = doc.DocumentNode;

            var cuisine = root.Descendants().FirstOrDefault(p => HasClassValue(p, "cuisine"));
            if (cuisine != null)
            {
                d.cuisine = cuisine.InnerText.Trim();
            }

            var address = root.Descendants().FirstOrDefault(p => HasClassValue(p, "format_address"));
            if (address != null)
            {
                d.Address = address.InnerText.Trim();
            }

            ReadCategoryRatings(d, root);
            ReadReviewCount(d, root);
            ReadRatingHistogram(d, root);

            // The attribute value really ends with two spaces; it is matched verbatim.
            var reviewNodes = root.Descendants().Where(p => p.Id.Contains("review_") && HasAnyAttributeValue(p, "reviewSelector  "));
            foreach (var item in reviewNodes)
            {
                var r = ParseReview(item);
                lock (ReviewAppendGate)
                {
                    d.Reviewes.Add(r);
                }
            }
        }

        // True when the node has a "class" attribute whose value equals <value> exactly.
        private static bool HasClassValue(HtmlNode node, string value)
        {
            return node.Attributes.Any(o => o.Name == "class" && o.Value == value);
        }

        // True when any attribute of the node (whatever its name) has exactly the given value.
        private static bool HasAnyAttributeValue(HtmlNode node, string value)
        {
            return node.Attributes.Any(o => o.Value == value);
        }

        // Copies the "alt" text of the Food / Value / Service / Atmosphere rating images into d.
        private static void ReadCategoryRatings(Data d, HtmlNode root)
        {
            var rows = root.Descendants().Where(p => p.Name == "div" && HasClassValue(p, "ratingRow wrap"));
            foreach (var row in rows)
            {
                var label = row.Descendants().FirstOrDefault(p => p.Name == "span" && HasClassValue(p, "text"));
                var image = row.Descendants().FirstOrDefault(p => p.Name == "img" && p.Attributes.Any(o => o.Name == "class" && o.Value.Contains("sprite-rating_s_fill rating_s_fill")));
                if (label == null || image == null)
                {
                    continue;
                }

                var alt = image.Attributes.FirstOrDefault(p => p.Name == "alt");
                if (alt == null)
                {
                    continue;
                }

                switch (label.InnerText.Trim())
                {
                    case "Food":
                        d.Food = alt.Value;
                        break;
                    case "Value":
                        d.Value = alt.Value;
                        break;
                    case "Service":
                        d.Service = alt.Value;
                        break;
                    case "Atmosphere":
                        d.Atmosphere = alt.Value;
                        break;
                }
            }
        }

        // Reads the review counter shown on the element with id "TABS_REVIEWS" into d.Counts.
        private static void ReadReviewCount(Data d, HtmlNode root)
        {
            var tab = root.Descendants().FirstOrDefault(p => p.Id == "TABS_REVIEWS");
            if (tab == null)
            {
                return;
            }

            var counts = tab.ChildNodes.FirstOrDefault(p => HasClassValue(p, "tabs_pers_counts"));
            if (counts == null)
            {
                return;
            }

            // The first character is dropped; empty text is skipped because Substring(1)
            // would throw on it and abort parsing of the whole page.
            // NOTE(review): presumably the text looks like "(123)", in which case the
            // closing bracket is kept - confirm.
            var text = counts.InnerText;
            if (!string.IsNullOrEmpty(text))
            {
                d.Counts = text.Substring(1);
            }
        }

        // Reads the Excellent / Very Good / Average / Poor / Terrible counts of the rating histogram.
        private static void ReadRatingHistogram(Data d, HtmlNode root)
        {
            var histogram = root.Descendants().FirstOrDefault(p => HasAnyAttributeValue(p, "histogramCommon simpleHistogram wrap"));
            if (histogram == null)
            {
                return;
            }

            foreach (var row in histogram.Descendants().Where(o => o.Name == "li"))
            {
                var label = row.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element && p.Name == "div" && HasAnyAttributeValue(p, "label fl part clickable"));
                var count = row.ChildNodes.FirstOrDefault(p => p.NodeType == HtmlNodeType.Element && p.Name == "div" && HasAnyAttributeValue(p, "valueCount fr part"));
                if (label == null || count == null)
                {
                    continue;
                }

                var name = label.InnerText.Trim();
                if (name == "Excellent")
                {
                    d.Excellent = count.InnerText;
                }
                else if (string.Equals(name, "Very Good", StringComparison.OrdinalIgnoreCase))
                {
                    d.VeryGood = count.InnerText;
                }
                else if (name == "Average")
                {
                    d.Average = count.InnerText;
                }
                else if (name == "Poor")
                {
                    d.Poor = count.InnerText;
                }
                else if (name == "Terrible")
                {
                    d.Terrible = count.InnerText;
                }
            }
        }

        // Builds a Review from one review element, including the reviewer's profile data.
        private static Review ParseReview(HtmlNode item)
        {
            var r = new Review();

            // Link to the individual review: first <a> inside the element whose class contains "quote".
            var quoteContainer = item.Descendants().FirstOrDefault(p => p.Attributes.Any(o => o.Name == "class" && o.Value.Contains("quote")));
            if (quoteContainer != null)
            {
                var link = quoteContainer.Descendants().FirstOrDefault(p => p.Name == "a");
                if (link != null)
                {
                    var href = link.Attributes.FirstOrDefault(p => p.Name == "href");
                    if (href != null)
                    {
                        r.Url = base_url + href.Value;
                    }
                }
            }

            var profile = item.Descendants().FirstOrDefault(p => p.Name == "div" && HasClassValue(p, "memberOverlayLink"));
            if (profile != null)
            {
                ReadReviewerProfile(r, profile.Id);
            }

            var date = item.Descendants().FirstOrDefault(p => p.Name == "span" && HasClassValue(p, "ratingDate"));
            if (date != null)
            {
                r.Date = date.InnerText.Replace("Reviewed", "").Replace("NEW", "").Trim();
            }

            var location = item.Descendants().FirstOrDefault(p => p.Name == "div" && HasClassValue(p, "location"));
            if (location != null)
            {
                // Only a "city,country" style value is used; the parts are stored untrimmed.
                var parts = location.InnerText.Split(',');
                if (parts.Length > 1)
                {
                    r.City = parts[0];
                    r.Country = parts[1];
                }
            }

            var rating = item.Descendants().FirstOrDefault(p => p.Name == "img" && HasClassValue(p, "sprite-rating_s_fill rating_s_fill s50"));
            if (rating != null)
            {
                var alt = rating.Attributes.FirstOrDefault(o => o.Name == "alt");
                if (alt != null)
                {
                    r.AvgReview = alt.Value;
                }
            }

            var userName = item.Descendants().FirstOrDefault(p => HasClassValue(p, "username mo"));
            if (userName != null)
            {
                r.Name = userName.InnerText.Trim();
            }

            var quote = item.Descendants().FirstOrDefault(p => p.Name == "span" && HasClassValue(p, "noQuotes"));
            if (quote != null)
            {
                r.ReviewQuote = quote.InnerText;
            }

            return r;
        }

        // Returns the text after the first '_' separated token, or an empty string if there is none.
        private static string SecondUnderscoreToken(string text)
        {
            var tokens = text.Split('_');
            return tokens.Length >= 2 ? tokens[1] : string.Empty;
        }

        // Downloads the member overlay identified by overlayId (two '-' separated parts, each
        // holding its value after a '_') and fills r.ReviererLevel and r.ReviewerProfileUrl.
        private static void ReadReviewerProfile(Review r, string overlayId)
        {
            var parts = overlayId.Split('-');
            if (parts.Length < 2)
            {
                return;
            }

            var uid = SecondUnderscoreToken(parts[0]);
            var src = SecondUnderscoreToken(parts[1]);
            if (string.IsNullOrWhiteSpace(src) || string.IsNullOrWhiteSpace(uid))
            {
                return;
            }

            var overlayUrl = base_url + string.Format("/MemberOverlay?uid={0}&c=&src={1}&fus=false&partner=false&LsoId=", uid, src);

            HtmlDocument dc = null;
            for (var attempt = 0; attempt < 10 && dc == null; attempt++)
            {
                try
                {
                    dc = GetDoc(overlayUrl);
                }
                catch (Exception)
                {
                    // Same message format as the other download loops, instead of failing silently.
                    Console.WriteLine("error to load " + overlayUrl + " " + attempt);
                    System.Threading.Thread.Sleep(10000);
                }
            }

            if (dc == null)
            {
                return;
            }

            var badge = dc.DocumentNode.Descendants().FirstOrDefault(p => p.Name == "div" && HasClassValue(p, "badgeinfo"));
            if (badge != null)
            {
                var level = badge.Descendants().FirstOrDefault(p => p.Name == "span");
                if (level != null)
                {
                    r.ReviererLevel = level.InnerText;
                }
            }

            var nav = dc.DocumentNode.Descendants().FirstOrDefault(p => p.Name == "div" && HasClassValue(p, "baseNav"));
            if (nav != null)
            {
                // The profile link is taken from the last <a> inside the navigation block.
                var profileLink = nav.Descendants().LastOrDefault(p => p.Name == "a");
                if (profileLink != null)
                {
                    var href = profileLink.Attributes.FirstOrDefault(x => x.Name == "href");
                    if (href != null)
                    {
                        r.ReviewerProfileUrl = base_url + "/" + href.Value;
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Downloads the restaurant page at <c>base_url + d.url</c>, parses it with
        /// <c>GetReview</c>, and then downloads and parses the remaining review pages in parallel.
        /// </summary>
        /// <param name="d">Restaurant record; <c>d.url</c> is read and the record is filled by <c>GetReview</c>.</param>
        /// <remarks>
        /// Every download is attempted up to 10 times with a 10 second pause after a failure.
        /// Exceptions are written to the console and not rethrown.
        /// </remarks>
        static void LoadReview(Data d)
        {
            try
            {
                HtmlDocument doc = null;
                for (var x = 0; x < 10 && doc == null; x++)
                {
                    try
                    {
                        doc = GetDoc(base_url + d.url);
                    }
                    catch
                    {
                        Console.WriteLine("error to load " + base_url + d.url + " " + x);
                        System.Threading.Thread.Sleep(10000);
                    }
                }
                if (doc == null)
                {
                    return;
                }

                // Parse the first page unconditionally: a page without a "pageNumbers" block
                // still carries the restaurant details and its reviews.
                GetReview(d, doc);

                var pageDiv = doc.DocumentNode.Descendants().FirstOrDefault(p => p.Attributes.FirstOrDefault(o => o.Name == "class" && o.Value == "pageNumbers") != null);
                if (pageDiv == null)
                {
                    return;
                }

                // The text of the last page link is taken as the number of review pages.
                var lastPageLink = pageDiv.ChildNodes.Where(o => o.NodeType == HtmlNodeType.Element && o.Name == "a").LastOrDefault();
                if (lastPageLink == null)
                {
                    return;
                }

                int count;
                if (!int.TryParse(lastPageLink.InnerText, out count) || count <= 1)
                {
                    return;
                }

                // Page 1 (offset 0) is already parsed; the remaining pages use offsets
                // 10, 20, ..., (count - 1) * 10.
                var remainingPages = Enumerable.Range(1, count - 1);
                Parallel.ForEach(remainingPages, p =>
                {
                    var url = d.url.Replace("Reviews-", "Reviews-or" + p * 10 + "-");
                    try
                    {
                        HtmlDocument dc = null;
                        for (var inl = 0; inl < 10 && dc == null; inl++)
                        {
                            try
                            {
                                dc = GetDoc(base_url + url);
                            }
                            catch
                            {
                                // Report the page that actually failed, not the first page.
                                Console.WriteLine("error to load " + base_url + url + " " + inl);
                                System.Threading.Thread.Sleep(10000);
                            }
                        }
                        if (dc != null)
                        {
                            GetReview(d, dc, base_url + url);
                        }
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(e.Message);
                    }
                });
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
        }
Пример #5
0
 /// <summary>
 /// Placeholder: the body is empty, so calling this method does nothing.
 /// </summary>
 /// <param name="d">Restaurant record; not read or modified.</param>
 static void GetIndividualReviews(Data d)
 {
 }