Пример #1
0
 /// <summary>
 /// Loads a single site by its identifier.
 /// </summary>
 /// <param name="id">Identifier of the site to look up.</param>
 /// <returns>The matching site, or <c>null</c> when no site has that identifier.</returns>
 public async Task <Site> GetDetailSite(long id)
 {
     using (var context = new JoogleContext())
     {
         var match = context.Sites.Where(s => s.Id == id).FirstOrDefault();
         return match;
     }
 }
Пример #2
0
 /// <summary>
 /// Builds the status shown before the parser is started.
 /// </summary>
 /// <returns>
 /// A fresh response whose <c>SitesNotParsed</c> is the number of sites that are
 /// neither deleted nor parsed, and whose <c>Finished</c> flag is <c>false</c>.
 /// </returns>
 public async Task <ParseResponse> InfoBeforeParse()
 {
     using (var context = new JoogleContext())
     {
         // The initializer runs after construction, so the response object is
         // created first and the count query is executed second.
         return new ParseResponse
         {
             SitesNotParsed = context.Sites.Count(s => !(s.IsDeleted || s.IsParsed)),
             Finished       = false
         };
     }
 }
Пример #3
0
 /// <summary>
 /// Clears the database by truncating the <c>Sites</c> and <c>Texts</c> tables.
 /// </summary>
 /// <remarks>
 /// The operation is best-effort: a failure is not propagated to the caller,
 /// but it is written to the trace listeners so it does not go unnoticed.
 /// </remarks>
 /// <returns>A task that completes when both statements have been attempted.</returns>
 public async Task ClearDatabase()
 {
     try
     {
         using (var db = new JoogleContext())
         {
             db.Database.ExecuteSqlCommand("TRUNCATE TABLE [Sites]");
             db.Database.ExecuteSqlCommand("TRUNCATE TABLE [Texts]");
         }
     }
     catch (Exception ex)
     {
         // Keep the original "never throw" contract, but leave a record of the failure
         // instead of silently discarding it.
         System.Diagnostics.Trace.TraceError("ClearDatabase failed: {0}", ex);
     }
 }
Пример #4
0
        /// <summary>
        /// Fills the given model with one page of sites, most recently modified first.
        /// </summary>
        /// <param name="model">Response model that receives the page of sites and the total count.</param>
        /// <param name="pageInfo">Requested page number and page size.</param>
        /// <returns>The same <paramref name="model"/> instance, populated.</returns>
        public async Task <SitesResponse> GetAllSites(SitesResponse model, PageInfo pageInfo)
        {
            using (var context = new JoogleContext())
            {
                // Number of rows that belong to the pages preceding the requested one.
                var offset = (pageInfo.PageNumber - 1) * pageInfo.PageSize;

                var page = context.Sites
                           .OrderByDescending(s => s.DateModify)
                           .Skip(offset)
                           .Take(pageInfo.PageSize)
                           .ToList();
                var total = context.Sites.Count();

                model.Sites               = page;
                model.PageInfo.TotalItems = total;
                return model;
            }
        }
Пример #5
0
 /// <summary>
 /// Updates the deleted flag of a stored site.
 /// </summary>
 /// <remarks>
 /// When the incoming <paramref name="site"/> is marked as parsed, the same deleted flag
 /// is also applied to every text that belongs to it. Nothing happens if the site
 /// is not found.
 /// </remarks>
 /// <param name="site">Site carrying the identifier and the new flag values.</param>
 public async Task EditSite(Site site)
 {
     using (var context = new JoogleContext())
     {
         var stored = context.Sites.Where(s => s.Id == site.Id).FirstOrDefault();
         if (stored == null)
         {
             return;
         }

         if (site.IsParsed)
         {
             var relatedTexts = context.Texts.Where(t => t.SiteId == site.Id).ToList();
             foreach (var text in relatedTexts)
             {
                 text.IsDeleted = site.IsDeleted;
             }
         }

         stored.IsDeleted = site.IsDeleted;
         context.SaveChanges();
     }
 }
Пример #6
0
        /// <summary>
        /// Finds texts whose title contains the search phrase and returns one page of them,
        /// most recently modified first.
        /// </summary>
        /// <param name="model">
        /// Search model: <c>Search</c> supplies the phrase; <c>Texts</c> and
        /// <c>PageInfo.TotalItems</c> are filled in by this method.
        /// </param>
        /// <param name="pageInfo">Requested page number and page size.</param>
        /// <returns>
        /// The same <paramref name="model"/> instance after it has been populated and passed
        /// through <c>SubstringTexts</c> (defined elsewhere in this class).
        /// </returns>
        public async Task <TextsResponse> Search(TextsResponse model, PageInfo pageInfo)
        {
            using (var db = new JoogleContext())
            {
                // Build the filter once so the page and the total are guaranteed
                // to be computed from the same predicate.
                var phrase  = model.Search;
                var matches = db.Texts.Where(x => x.Title.Contains(phrase));

                var texts = matches
                            .OrderByDescending(x => x.DateModify)
                            .Skip((pageInfo.PageNumber - 1) * pageInfo.PageSize)
                            .Take(pageInfo.PageSize)
                            .ToList();
                var countTexts = matches.Count();

                model.Texts = texts;
                model.PageInfo.TotalItems = countTexts;
                SubstringTexts(model);

                return(model);
            }
        }
Пример #7
0
 /// <summary>
 /// Registers a new site unless one with the same URL is already stored.
 /// </summary>
 /// <remarks>
 /// Trailing slashes are stripped from the requested URL and the result is lower-cased
 /// before it is compared and stored.
 /// </remarks>
 /// <param name="request">Creation request carrying the site URL.</param>
 public async Task CreateSite(CreateSiteRequest request)
 {
     using (var context = new JoogleContext())
     {
         var trimmedUrl   = request.Url.TrimEnd('/');
         var alreadyKnown = context.Sites.FirstOrDefault(s => s.Url == trimmedUrl.ToLower()) != null;
         if (!alreadyKnown)
         {
             context.Sites.Add(new Site
             {
                 Url        = trimmedUrl.ToLower(),
                 DateModify = DateTime.UtcNow
             });
             context.SaveChanges();
         }
     }
 }
Пример #8
0
 /// <summary>
 /// Deletes a site together with all texts that belong to it.
 /// </summary>
 /// <remarks>
 /// Nothing happens if the site is not found. The operation is best-effort: a failure
 /// is not propagated to the caller, but it is written to the trace listeners.
 /// </remarks>
 /// <param name="site">Site whose identifier selects the record to delete.</param>
 public async Task DeleteSite(Site site)
 {
     try
     {
         using (var db = new JoogleContext())
         {
             var exist = db.Sites.FirstOrDefault(x => x.Id == site.Id);
             if (exist == null)
             {
                 return;
             }

             // Remove every text of the site. Deleting only the first match would
             // leave the remaining texts behind, pointing at a site that no longer exists.
             var texts = db.Texts.Where(x => x.SiteId == exist.Id).ToList();
             db.Texts.RemoveRange(texts);

             db.Sites.Remove(exist);
             db.SaveChanges();
         }
     }
     catch (Exception ex)
     {
         // Keep the original "never throw" contract, but leave a record of the failure.
         System.Diagnostics.Trace.TraceError("DeleteSite failed: {0}", ex);
     }
 }
Пример #9
0
        /// <summary>
        /// Runs the parser over every site that is neither deleted nor parsed yet,
        /// processing at most 20 sites at the same time.
        /// </summary>
        /// <param name="model">Response model that receives the run statistics.</param>
        /// <returns>
        /// The same <paramref name="model"/> instance with <c>Sites</c> (number of sites submitted),
        /// <c>Time</c> (elapsed wall-clock time), <c>Finished</c> set to <c>true</c> and
        /// <c>SitesNotParsed</c> (sites still pending after the run).
        /// </returns>
        public async Task <ParseResponse> StartParseAllSites(ParseResponse model)
        {
            const int maxConcurrency = 20;

            using (var db = new JoogleContext())
            {
                var startTime = DateTime.UtcNow;
                var sites     = db.Sites.Where(x => !x.IsDeleted && !x.IsParsed).ToList();

                // The using statement disposes the semaphore; no explicit Dispose call is needed.
                using (var concurrencySemaphore = new SemaphoreSlim(maxConcurrency))
                {
                    var tasks = new List <Task>(sites.Count);
                    foreach (var site in sites)
                    {
                        // Wait for a free slot without blocking the calling thread.
                        // ConfigureAwait(false) keeps callers that block on the returned
                        // task from deadlocking on a captured synchronization context.
                        await concurrencySemaphore.WaitAsync().ConfigureAwait(false);

                        tasks.Add(Task.Run(async () =>
                        {
                            try
                            {
                                // Await the parse so the slot is released only after
                                // the site has actually been processed.
                                await SiteParse(site).ConfigureAwait(false);
                            }
                            finally
                            {
                                concurrencySemaphore.Release();
                            }
                        }));
                    }

                    await Task.WhenAll(tasks).ConfigureAwait(false);
                }

                var endTime = DateTime.UtcNow;
                model.Sites          = sites.Count;
                model.Time           = endTime - startTime;
                model.Finished       = true;
                model.SitesNotParsed = db.Sites.Count(x => !x.IsDeleted && !x.IsParsed);

                return(model);
            }
        }
Пример #10
0
        /// <summary>
        /// Parses a single site: downloads its page, stores the text of its headings and
        /// paragraphs, and registers every newly discovered absolute link as a new site.
        /// </summary>
        /// <remarks>
        /// The site is marked as parsed even when the page yields no text. The operation is
        /// best-effort: a failure is not propagated to the caller, but it is written to the
        /// trace listeners.
        /// NOTE(review): the page load is awaited by blocking, so the work is finished when
        /// the call returns. Switching to <c>await</c> is only safe once every caller awaits
        /// the returned task.
        /// </remarks>
        /// <param name="obj">Site to parse.</param>
        /// <returns>A task that is already complete when the method returns.</returns>
        private async Task SiteParse(Site obj)
        {
            try
            {
                using (var db = new JoogleContext())
                {
                    var exist = db.Sites.FirstOrDefault(x => x.Id == obj.Id);
                    if (exist == null)
                    {
                        return;
                    }

                    var config = Configuration.Default.WithDefaultLoader();
                    var html   = BrowsingContext.New(config).OpenAsync(obj.Url).Result;

                    // Concatenate the text of all headings and paragraphs, space-separated.
                    var result = new StringBuilder();
                    foreach (var selector in html.QuerySelectorAll("h1, h2, h3, h4, p"))
                    {
                        result.Append(" ");
                        result.Append(selector.TextContent);
                        result.Append(" ");
                    }

                    var hrefs = html.QuerySelectorAll("a")
                                .Where(x => x.Attributes["href"] != null)
                                .Select(x => x.Attributes["href"].Value)
                                .Where(x => x.StartsWith("http", StringComparison.Ordinal));

                    // De-duplicate AFTER normalization: "http://a/" and "http://A" are the
                    // same site and must not be inserted twice.
                    var seenUrls = new HashSet <string>();
                    var newSites = new List <Site>();
                    foreach (var href in hrefs)
                    {
                        // Same normalization as CreateSite: drop trailing slashes, lower-case.
                        var url = href.TrimEnd('/').ToLower();
                        if (!seenUrls.Add(url) || url == exist.Url)
                        {
                            continue;
                        }
                        if (db.Sites.Any(x => x.Url == url))
                        {
                            continue;
                        }
                        newSites.Add(new Site
                        {
                            Url        = url,
                            DateModify = DateTime.UtcNow
                        });
                    }

                    var text = result.ToString();
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        db.Texts.Add(new Text
                        {
                            SiteId     = exist.Id,
                            Url        = obj.Url,
                            Title      = text,
                            DateModify = DateTime.UtcNow
                        });
                    }

                    exist.IsParsed = true;
                    db.Sites.AddRange(newSites);
                    db.SaveChanges();
                }
            }
            catch (Exception ex)
            {
                // Keep the original "never throw" contract, but leave a record of the failure.
                System.Diagnostics.Trace.TraceError(
                    "SiteParse failed for '{0}': {1}",
                    obj != null ? obj.Url : null,
                    ex);
            }
        }