Example #1
0
        /// <summary>
        /// Parses raw HTML markup into a document and delegates image handling to the
        /// document-based overload.
        /// </summary>
        /// <param name="content">The raw HTML markup; when <c>null</c>, nothing is scheduled.</param>
        /// <param name="title">The title forwarded to the document-based overload; when <c>null</c>, nothing is scheduled.</param>
        /// <param name="rssKey">The RSS key forwarded to the document-based overload (optional).</param>
        /// <returns>The tasks returned by the document-based overload, or an empty array when
        /// <paramref name="content"/> or <paramref name="title"/> is <c>null</c>.</returns>
        protected IEnumerable<Task> HandleImages(string content, string title, RssKey rssKey = null)
        {
            // Without markup or a title there is nothing to hand over.
            if (content == null || title == null)
            {
                return new Task[0];
            }

            var document = new HtmlDocument();
            document.LoadHtml(content);

            return HandleImages(document, title, rssKey);
        }
Example #2
0
        /// <summary>
        /// Notifies the observer about every image found in the given document.
        /// </summary>
        /// <param name="doc">The document to scan; when <c>null</c>, nothing is scheduled.</param>
        /// <param name="title">The title passed to the observer for each image; when <c>null</c>, nothing is scheduled.</param>
        /// <param name="rssKey">Optional RSS key. When present, its <c>Link</c> and <c>Updated</c> values are used
        /// instead of the source URI and the current UTC time.</param>
        /// <returns>A list holding one observer task per image; empty when there is nothing to process.</returns>
        protected IEnumerable<Task> HandleImages(HtmlDocument doc, string title, RssKey rssKey = null)
        {
            var pendingJobs = new List<Task>();

            if (doc == null || title == null)
            {
                return pendingJobs;
            }

            var foundImages = doc.GetAllImages();

            // Prefer the values of the RSS key; fall back to the source URI and "now".
            var hasKey    = rssKey != null;
            var origin    = hasKey ? rssKey.Link : Source.Uri;
            var timestamp = hasKey ? new DateTime(rssKey.Updated) : DateTime.UtcNow;

            if (foundImages == null)
            {
                return pendingJobs;
            }

            foreach (var entry in foundImages)
            {
                // The tuple items are forwarded in swapped order (Item2 first, then Item1).
                var job = Observer.NotifyNewImageContentFoundThreadSafe(
                    title, entry.Item2, entry.Item1, origin, timestamp, Source);
                pendingJobs.Add(job);
            }

            return pendingJobs;
        }
        /// <summary>
        /// Notifies the Observer that a (probably) new RSS feed item has been detected by a crawler.
        /// Note: Must be thread safe.
        /// </summary>
        /// <remarks>
        /// The summary and title are converted to plain text. A missing summary becomes an empty string;
        /// a missing title falls back to the name of the source. The item is only indexed when neither
        /// text is rejected by the blacklist check of the source.
        /// </remarks>
        /// <param name="item">The item to index.</param>
        /// <param name="rssKey">The RSS key belonging to the item; supplies the link and the update timestamp.</param>
        /// <param name="source">The source the item was found in.</param>
        /// <returns>If indexing is required: a task with the corresponding indexing job.
        /// <para>Otherwise: a completed Task</para></returns>
        public Task NotifyNewRssFeedFoundThreadSave(SyndicationItem item, RssKey rssKey, Source source)
        {
            var plainText = item.Summary?.Text?.GetTextFromHtml() ?? string.Empty;

            // Guard Text with "?." as well (same as for the summary): a title object whose Text is null
            // must fall back to the source name instead of passing null into the HTML conversion.
            var plainTitle = item.Title?.Text?.GetTextFromHtml() ?? source.Name;

            // The text is checked first; the title is only checked when the text passed.
            if (!DoesNotViolateBlackList(source, plainText) || !DoesNotViolateBlackList(source, plainTitle))
            {
                return Task.CompletedTask;
            }

            var article = new Article(Guid.NewGuid(),
                                      plainTitle,
                                      rssKey.Link,
                                      null,
                                      plainText,
                                      new DateTime(rssKey.Updated),
                                      DateTime.UtcNow,
                                      source.Id);

            var job = DatabaseAdapterFactory.GetControllerInstance<IIndexerService>()
                      .IndexRssFeedItemThreadSafeAsync(article, rssKey);

            // Never hand a null task back to the caller.
            return job ?? Task.CompletedTask;
        }
        /// <summary>
        /// Indexes a freshly generated article through the indexer service and fails
        /// when the indexing call throws or its task faults.
        /// </summary>
        public void IndexRssItemTest()
        {
            var elasticController = DatabaseAdapterFactory.GetControllerInstance<IIndexerService>();

            // The current Unix time in milliseconds is embedded into title and link of the article.
            var milliseconds = DateTimeOffset.Now.ToUnixTimeMilliseconds();
            var guid         = Guid.NewGuid();
            var indexedguid  = Guid.NewGuid();
            var articleOne   = new Article(guid, "abgef:" + milliseconds,
                                           "http://" + milliseconds + ".de", "http://" + milliseconds + ".de", "abgefssmt",
                                           DateTime.Today, DateTime.Today, indexedguid);

            var rsskey = new RssKey(milliseconds, articleOne.Link);

            try
            {
                var job = elasticController.IndexRssFeedItemThreadSafeAsync(articleOne, rsskey);

                // Block until the indexing task has finished. Without this the test returned while the
                // work was still running and a faulted task was never observed.
                if (job != null)
                {
                    job.GetAwaiter().GetResult();
                }
            }
            catch (Exception e)
            {
                Assert.Fail("Indexing the RSS item failed: " + e);
            }
        }
Example #5
0
        /// <summary>
        /// Indexes an RSS article unless it is reported as a duplicate.
        /// </summary>
        /// <remarks>
        /// The compilation symbol <c>ST</c> selects between two variants of this method:
        /// with <c>ST</c> defined the whole body runs inside <c>lock (lockObj)</c> and blocks on the
        /// results of the helper tasks; without it the helper tasks are awaited and no lock is taken.
        /// Exceptions thrown inside the try block (including the one raised for an
        /// <c>UnknownError</c> result) are caught and written to the console instead of being
        /// propagated to the caller.
        /// </remarks>
        /// <param name="article">The article to index.</param>
        /// <param name="key">The RSS key; its <c>Updated</c> and <c>Link</c> values become part of the lookup key.</param>
        /// <returns>A task representing the indexing operation.</returns>
        public async Task IndexRssFeedItemThreadSafeAsync(Article article, RssKey key)
        {
#if ST
            lock (lockObj)
#endif
            {
                try
                {
                    //using (new PerfTracer(nameof(IndexRssFeedItemThreadSafeAsync)))
                    {
                        // A new client is created for every call.
                        var client   = new ElasticClient(settings);
                        // Key used for the duplicate check and handed on to IndexArticle.
                        var speedKey = new ShortArticleKey(article.IndexingSourceId, key.Updated, key.Link);
#if ST
                        // Blocking variant: "await" cannot be used inside the lock statement.
                        if (!(IsArticleDuplicate(speedKey, client).Result))
#else
                        if (!(await IsArticleDuplicate(speedKey, client)))
#endif
                        {
#if ST
                            var result = IndexArticle(article, speedKey, client).Result;
#else
                            var result = await IndexArticle(article, speedKey, client);
#endif

                            // Only UnknownError is treated as a failure; the exception is caught by the
                            // handler below and therefore ends up on the console.
                            if (result.Status == SuccessState.UnknownError)
                            {
                                throw new Exception("IndexArticle failed: " + result.UserMessage);
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    // Best effort: failures are logged to the console and not rethrown.
                    Console.WriteLine(e);
                }
            }
        }
        /// <summary>
        /// Handles a given document by applying the configured XPath and/or regex filter and
        /// notifying the observer about the resulting content.
        /// </summary>
        /// <remarks>
        /// A document whose text equals the text of the previously handled document is skipped.
        /// Depending on the configuration one of the following happens:
        /// <list type="bullet">
        /// <item><description>No XPath and no regex: the whole document text is reported, followed by its images.</description></item>
        /// <item><description>Regex only: the document text is passed to the regex handling.</description></item>
        /// <item><description>XPath (with or without regex): every node selected by the XPath is either
        /// reported directly together with its images, or passed to the regex handling.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="doc">The doc to handle; when <c>null</c>, nothing is scheduled.</param>
        /// <param name="title">The title (optional); determined from the document when omitted.</param>
        /// <param name="rssKey">The rss key (optional); supplies link and publication time when given.</param>
        /// <returns>The tasks created while handling the document; empty when there was nothing to do.</returns>
        internal IEnumerable<Task> HandleDocument(HtmlDocument doc, string title = null, RssKey rssKey = null)
        {
            // A missing document yields no work, in line with how HandleImages treats null input.
            if (doc == null)
            {
                return new Task[0];
            }

            // Unchanged content since the last fetch: nothing new to report.
            if (string.Equals(doc.Text, this.lastFetchedContent, StringComparison.Ordinal))
            {
                return new Task[0];
            }

            this.lastFetchedContent = doc.Text;

            var link      = rssKey != null ? rssKey.Link : Source.Uri;
            var published = rssKey != null ? new DateTime(rssKey.Updated) : DateTime.UtcNow;
            var noXPath   = string.IsNullOrEmpty(Source.XPath);
            var noRegex   = this.regex == null;
            var jobList   = new List<Task>();

            // First possibility (makes nearly no sense): no filter -> direct index.
            if (noRegex && noXPath)
            {
                title = title ?? GetTitle(doc);
                jobList.Add(Observer.NotifyNewWebPageContentFoundThreadSafe(title, doc.GetTextFromHtml(), link, published, Source));
                jobList.AddRange(HandleImages(doc, title, rssKey));
                return jobList;
            }

            // Regex only: run the regex over the complete document text.
            if (noXPath)
            {
                title = title ?? GetTitle(doc);
                jobList.AddRange(HandleRegex(doc.Text, title, link, published));
                return jobList;
            }

            // SelectNodes yields null (not an empty collection) when the XPath matches nothing;
            // iterating it unchecked would throw a NullReferenceException.
            var relevantNodes = doc.DocumentNode.SelectNodes(Source.XPath);
            if (relevantNodes == null)
            {
                return jobList;
            }

            // Resolve the title once; it is the same for every selected node.
            title = title ?? GetTitle(doc);

            foreach (var node in relevantNodes)
            {
                if (noRegex)
                {
                    // XPath only: report the node content and the images inside the node.
                    jobList.Add(Observer.NotifyNewWebPageContentFoundThreadSafe(title,
                                                                                node.InnerHtml.GetTextFromHtml(),
                                                                                link, published, Source));
                    jobList.AddRange(HandleImages(node.InnerHtml, title, rssKey));
                }
                else
                {
                    // XPath and regex: apply the regex to the markup of the selected node.
                    jobList.AddRange(HandleRegex(node.InnerHtml, title, link, published));
                }
            }

            return jobList;
        }