Пример #1
0
        /// <summary>
        /// Collects the links reachable from <paramref name="URL"/> by delegating to a
        /// newly created <see cref="AbotProvider"/>.
        /// </summary>
        /// <param name="URL">Address handed to the provider as the crawl starting point.</param>
        /// <returns>The list of <see cref="CrawlerPage"/> items produced by the provider.</returns>
        public async Task<List<CrawlerPage>> CollectLinks(string URL)
        {
            AbotProvider crawler = new AbotProvider();

            // Await the provider call first, then hand the finished list back to the caller.
            List<CrawlerPage> collectedPages = await crawler.CollectLinks(URL);
            return collectedPages;
        }
Пример #2
0
        /// <summary>
        /// Crawls the supplied URLs for the given user's brand and creates a widget for every
        /// discovered link whose content id is not already known, recording progress and the
        /// final outcome in the crawl history.
        /// </summary>
        /// <param name="UserId">Id used to look up the user and the already-known content ids.</param>
        /// <param name="URLs">Seed URLs to crawl; null, empty and whitespace-only entries are skipped.</param>
        /// <returns>
        /// <c>true</c> when the run completed and the crawl history was saved as successful;
        /// <c>false</c> when the user or the brand settings could not be resolved, or when an
        /// exception aborted the run (the failure message is then saved to the crawl history).
        /// </returns>
        public bool WidgetAutoGenerationProcess(string UserId, List<string> URLs)
        {
            const int DisplayWidgetsCount = 10;   // at most this many valid widgets are created as non-draft
            const int MinWordsCount       = 100;  // content needs more words than this to count as valid

            var brandSettingsBLL     = new BrandSettingsBLL();
            var widgetsBLL           = new WidgetsBLL();
            var contentsBLL          = new ContentsBLL();
            var tagsBLL              = new TagsBLL();
            var predictiveContentBLL = new PredictiveContentBLL();
            var dropZoneBLL          = new DropZoneBLL();

            // Resolve the user and brand up front: without them no crawl history can be created,
            // so report failure through the return value instead of a NullReferenceException.
            var user = new UserDataSource().FindAspNetUserById(UserId);
            if (user == null)
            {
                Log4NetTraceHelper.Error(typeof(CrawlerBLL), "Widget auto generation aborted, error: ",
                                         new InvalidOperationException($"user '{UserId}' was not found"));
                return false;
            }

            var brandSettings = brandSettingsBLL.GetBrandSettingsByUser(user).FirstOrDefault();
            if (brandSettings == null)
            {
                Log4NetTraceHelper.Error(typeof(CrawlerBLL), "Widget auto generation aborted, error: ",
                                         new InvalidOperationException($"no brand settings found for user '{UserId}'"));
                return false;
            }

            var processId = widgetsBLL.CreateCrawlHistory(user.Id, brandSettings.Id);

            // Both counters are modified from the parallel widget-creation step below,
            // so they are only ever touched through Interlocked/Volatile.
            var numOfValidWidgets = 0;
            var numOfWidgets      = 0;

            try
            {
                var contentPatterns = predictiveContentBLL.GetByBrand(brandSettings.Id);

                // Step 1: collect pages from the seed URLs, restricted to the scannable patterns.
                var pagesFilters = new List<ICrawlerFilter>();
                foreach (var contentPattern in contentPatterns.Where(x => x.IsScannable))
                {
                    pagesFilters.Add(new CrawlerWildcardFilter(contentPattern.FullPattern));
                }

                var pagesBot   = new AbotProvider();
                var pagesTasks = URLs.Where(x => !string.IsNullOrWhiteSpace(x))
                                     .Select(url => pagesBot.CollectLinks(url, pagesFilters))
                                     .ToList();
                Task.WaitAll(pagesTasks.ToArray());

                // Same result and order as chaining Union() per task, without rebuilding the list each time.
                var pages = pagesTasks.SelectMany(t => t.Result).Distinct().ToList();

                // Step 2: collect links from every page. Video/PDF links and every brand pattern
                // are included; images and social media links are excluded.
                var filtersInclude = new List<ICrawlerFilter>
                {
                    new CrawlerVimeoVideoFilter(),
                    new CrawlerWistiaVideoFilter(),
                    new CrawlerYoutubeVideoFilter(),
                    new CrawlerPDFFilter()
                };

                var filtersExclude = new List<ICrawlerFilter>
                {
                    new CrawlerImageFilter(),
                    new CrawlerSocialMediaFilter()
                };

                foreach (var contentPattern in contentPatterns)
                {
                    filtersInclude.Add(new CrawlerWildcardFilter(contentPattern.FullPattern));
                }

                var linksBot   = new RbotProvider();
                var linksTasks = pages.Select(page => linksBot.CollectLinks(page.URL, filtersInclude, filtersExclude))
                                      .ToList();
                Task.WaitAll(linksTasks.ToArray());

                var links = linksTasks.SelectMany(t => t.Result)
                                      .Distinct(new CrawlerPageComparer())
                                      .ToList();

                widgetsBLL.UpdateCrawlHistoryCounters(processId, links.Count, 0);

                // Only read (never written) by the parallel step below.
                var contentIdsHashSet = new HashSet<string>(widgetsBLL.GetContentIds(UserId));

                // Step 3: build one widget per link. Runs concurrently via PLINQ, and each item
                // swallows and logs its own failures so one bad link does not abort the run.
                Action<CrawlerPage> actionAddWidget = item =>
                {
                    try
                    {
                        // deduplication: skip links whose content id already exists for this user
                        var hashItemContentId = UrlHelper.GenerateUrlId(UrlHelper.GetManipulatedUrl(item.URL, AppSettings.CaseSensitiveDomains));
                        if (contentIdsHashSet.Contains(hashItemContentId))
                        {
                            return;
                        }

                        var tags   = tagsBLL.GetSuggestedTagsByUrlOrDefault(user.UserName, item.URL);
                        var tagIds = tags?.Select(t => t.Id).ToList();

                        // execute "preview" function
                        var contentTask = contentsBLL.GetContentEntity(item.URL);
                        contentTask.Wait();
                        var content = contentTask.Result;

                        // fall back to the file name part of the URL when the content has no title
                        if (string.IsNullOrWhiteSpace(content.Title))
                        {
                            content.Title = Path.GetFileName(new Uri(item.URL).LocalPath);
                        }

                        // crop main image
                        var imageToCrop  = content.ImagesUrls.FirstOrDefault();
                        var croppedImage = ImageHelper.SmartCrop(imageToCrop, 150, 150);

                        var croppedImageURL = string.Empty;
                        if (croppedImage != null)
                        {
                            croppedImageURL = brandSettingsBLL.UploadImage(croppedImage, user.UserName);
                        }

                        /*
                         *  build MetaData as JsonStr:
                         *
                         *  [7] CONTENT
                         *  { content_text, content_image_url, content_url, cta_text, ImagesUrls: [], widget_title }
                         *
                         *  [8] VIDEO
                         *  { video_text, video_image_url, video_url, cta_text, ImagesUrls: [], video_type, widget_title }
                         */
                        var MetaData = string.Empty;  //  JSON PER TYPE (See DB structure); stays empty for any other content type

                        if (item.ContentType == eCrawlerContentType.VIDEO)
                        {
                            MetaData = JsonConvert.SerializeObject(new
                            {
                                video_text      = content.Description,
                                video_image_url = croppedImageURL,
                                video_url       = item.URL,
                                cta_text        = "Watch Now",
                                content.ImagesUrls,
                                video_type   = content.ContentType,
                                widget_title = content.Title
                            });
                        }
                        else if (item.ContentType == eCrawlerContentType.CONTENT)
                        {
                            MetaData = JsonConvert.SerializeObject(new
                            {
                                content_text      = content.Description,
                                content_image_url = croppedImageURL,
                                content_url       = item.URL,
                                cta_text          = "Read More",
                                content.ImagesUrls,
                                widget_title = content.Title
                            });
                        }

                        // valid = has a cropped image, a title and enough words
                        var IsValid = croppedImage != null
                                      && !string.IsNullOrEmpty(content.Title)
                                      && content.WordsCount > MinWordsCount;

                        // Atomically assign this widget its 1-based position among the valid ones;
                        // only the first DisplayWidgetsCount valid widgets are published, the rest stay drafts.
                        var validRank = IsValid ? System.Threading.Interlocked.Increment(ref numOfValidWidgets) : 0;
                        var IsDraft   = !IsValid || validRank > DisplayWidgetsCount;

                        if (IsValid)
                        {
                            var task = dropZoneBLL.AddPagesToURLMap(item.URL, brandSettings.WebstieScriptKey);
                            task.Wait();
                        }

                        long id = 0;
                        try
                        {
                            id = widgetsBLL.Add(content.Title, MetaData, item.ContentType == eCrawlerContentType.VIDEO ? 8 : 7, true, user.UserName, tagIds, item.URL, null, IsDraft, IsValid);
                        }
                        catch (Exception ex)
                        {
                            Log4NetTraceHelper.Error(typeof(CrawlerBLL), "Failed to generate a widget, error: ", ex);
                        }

                        // Count the widget only when it was actually stored, then publish the running total.
                        // Concurrent progress updates may land out of order; the final total is written
                        // by SaveCrawlHistory once all items are done.
                        var widgetsSoFar = id > 0
                            ? System.Threading.Interlocked.Increment(ref numOfWidgets)
                            : System.Threading.Volatile.Read(ref numOfWidgets);
                        widgetsBLL.UpdateCrawlHistoryCounters(processId, null, widgetsSoFar);
                    }
                    catch (Exception ex)
                    {
                        Log4NetTraceHelper.Error(typeof(CrawlerBLL), $"generic exception, url: '{item.URL}', error: ", ex);
                    }
                };

                links.AsParallel().ForAll(actionAddWidget);  // Create Widget

                widgetsBLL.SaveCrawlHistory(processId, DateTime.UtcNow, true, string.Empty, null, numOfWidgets);
                return true;
            }
            catch (Exception ex)
            {
                // Record the failure together with whatever was created before the run aborted.
                widgetsBLL.SaveCrawlHistory(processId, DateTime.UtcNow, false, ex.Message, null, numOfWidgets);
                return false;
            }
        }