예제 #1
0
 /// <summary>
 /// Stores the given property ids as exported-property rows in the database.
 /// </summary>
 /// <param name="ids">Property ids to record; one row is added per id.</param>
 /// <remarks>
 /// Any exception raised while saving is written to the console and not rethrown.
 /// </remarks>
 public static void SaveIds(List <string> ids)
 {
     try
     {
         using (var db = new SCrawlerEntities())
         {
             // Deferred query: the rows are materialised when AddRange enumerates it.
             var rows = from propertyId in ids
                        select new ExportedProperties { ExportedPropertyId = propertyId };

             db.ExportedProperties.AddRange(rows);
             db.SaveChanges();
         }
     }
     catch (Exception error)
     {
         Console.WriteLine($"Exception message: {error.Message}, Inner Exception: {error.InnerException}");
     }
 }
        /// <summary>
        /// Scans the search result pages of spitogatos.gr and collects the property ids listed on them.
        /// </summary>
        /// <param name="pages">Maximum number of result pages to request.</param>
        /// <param name="client">HTTP client used to download each result page.</param>
        /// <param name="url">Base search URL; <c>/offset_{page * 10}</c> is appended for every request.</param>
        /// <returns>The ids that have not been downloaded yet, i.e. those not stored as exported properties.</returns>
        /// <remarks>
        /// Scanning stops early at the first page whose listings cannot be extracted (see
        /// <see cref="TryExtractPropertyIds"/>). Exceptions raised while downloading a page or while
        /// querying the database are written to the console and rethrown.
        /// </remarks>
        public static async Task<List<string>> GetNewPropertyIds(int pages, HttpClient client, string url)
        {
            var allPropertyIds = new List<string>();

            // NOTE(review): the first request uses offset_10, so offset_0 is never requested —
            // confirm that this matches the site's paging scheme.
            for (int i = 1; i <= pages; i++)
            {
                Console.WriteLine("Loading page " + i + " from " + pages);

                try
                {
                    var html = await client.GetStringAsync(new Uri(url + $"/offset_{i * 10}"));

                    // Pause 1-2 seconds between requests. Random.Next's upper bound is exclusive,
                    // so (1, 3) is required to ever get 2. Task.Delay waits without blocking the thread.
                    int delaySeconds = _random.Next(1, 3);
                    await Task.Delay(TimeSpan.FromSeconds(delaySeconds));

                    List<string> pageItemIds;
                    if (!TryExtractPropertyIds(html, out pageItemIds))
                    {
                        // A page without usable listings marks the end of the results;
                        // there is no reason to request subsequent pages.
                        break;
                    }

                    allPropertyIds.AddRange(pageItemIds);
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"Exception message: {ex.Message}, Inner Exception: {ex.InnerException}");
                    throw;
                }
            }

            // Keep only the ids that have not been exported yet.
            try
            {
                using (var context = new SCrawlerEntities())
                {
                    // A hash set keeps each membership test O(1) instead of scanning a list per id.
                    var exportedPropertyIds = new HashSet<string>(
                        context.ExportedProperties.Select(p => p.ExportedPropertyId));

                    return allPropertyIds.Where(id => !exportedPropertyIds.Contains(id)).ToList();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception message: {ex.Message}, Inner Exception: {ex.InnerException}");
                throw;
            }
        }

        /// <summary>
        /// Extracts the property ids from the HTML of one search result page.
        /// </summary>
        /// <param name="html">Raw HTML of the result page.</param>
        /// <param name="propertyIds">
        /// On success, the ids found on the page (possibly empty); otherwise <c>null</c>.
        /// </param>
        /// <returns>
        /// <c>false</c> when the page does not contain exactly one <c>searchDetailsListings</c> div,
        /// when that div has no <c>div/div</c> children, or when a listing has no anchor or no usable
        /// id in its href; <c>true</c> otherwise.
        /// </returns>
        private static bool TryExtractPropertyIds(string html, out List<string> propertyIds)
        {
            propertyIds = null;

            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            // Exactly one listings container is expected; Take(2) is enough to tell 0, 1 and "many" apart.
            var listingContainers = htmlDocument.DocumentNode.Descendants("div")
                                    .Where(d => d.GetAttributeValue("id", "").Contains("searchDetailsListings"))
                                    .Take(2)
                                    .ToList();
            if (listingContainers.Count != 1)
            {
                return false;
            }

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var candidateNodes = listingContainers[0].SelectNodes("div/div");
            if (candidateNodes == null)
            {
                return false;
            }

            var pageItemIds = new List<string>();
            foreach (var mediaDiv in candidateNodes.Where(n => n.GetAttributeValue("class", "").Contains("media")))
            {
                var anchor = mediaDiv.SelectSingleNode("a[1]");
                if (anchor == null)
                {
                    return false;
                }

                // The property id is the part of the href after the last '-'.
                var href = anchor.GetAttributeValue("href", "");
                var lastPart = href.Substring(href.LastIndexOf('-') + 1);
                if (lastPart.Length == 0)
                {
                    return false;
                }

                // The first character of that part is dropped — presumably a non-numeric prefix;
                // confirm against real hrefs.
                pageItemIds.Add(lastPart.Substring(1));
            }

            propertyIds = pageItemIds;
            return true;
        }