/// <summary>
/// Parses raw HTML markup and forwards the resulting document to the
/// <see cref="HtmlDocument"/> based overload of HandleImages.
/// </summary>
/// <param name="content">The HTML markup to parse. When null, nothing is processed.</param>
/// <param name="title">The title passed on to the image notifications. When null, nothing is processed.</param>
/// <param name="rssKey">The RSS key (optional); passed through unchanged.</param>
/// <returns>The tasks produced by the document based overload, or an empty array
/// when <paramref name="content"/> or <paramref name="title"/> is null.</returns>
protected IEnumerable<Task> HandleImages(string content, string title, RssKey rssKey = null)
{
    // Without markup or a title there is nothing to report.
    if (content == null || title == null)
    {
        return new Task[0];
    }

    var parsed = new HtmlDocument();
    parsed.LoadHtml(content);
    return HandleImages(parsed, title, rssKey);
}
/// <summary>
/// Reports every image returned by <c>doc.GetAllImages()</c> to the observer.
/// </summary>
/// <param name="doc">The document to scan. When null, nothing is processed.</param>
/// <param name="title">The title passed to each notification. When null, nothing is processed.</param>
/// <param name="rssKey">The RSS key (optional). When given, its link and update stamp are used;
/// otherwise the source URI and the current UTC time are used.</param>
/// <returns>One task per reported image; an empty list when there is nothing to report.</returns>
protected IEnumerable<Task> HandleImages(HtmlDocument doc, string title, RssKey rssKey = null)
{
    var pending = new List<Task>();
    if (doc == null || title == null)
    {
        return pending;
    }

    var foundImages = doc.GetAllImages();

    // Fall back to the source itself and "now" when no RSS key accompanies the document.
    var link = rssKey == null ? Source.Uri : rssKey.Link;
    var published = rssKey == null ? DateTime.UtcNow : new DateTime(rssKey.Updated);

    if (foundImages == null)
    {
        return pending;
    }

    foreach (var entry in foundImages)
    {
        var notification = Observer.NotifyNewImageContentFoundThreadSafe(
            title, entry.Item2, entry.Item1, link, published, Source);
        pending.Add(notification);
    }

    return pending;
}
/// <summary>
/// Notifies the observer that a (probably) new RSS feed item has been detected by a crawler.
/// Note: must be thread safe.
/// </summary>
/// <param name="item">The feed item to index. Its summary and title are converted to plain text.</param>
/// <param name="rssKey">The RSS key belonging to the item; supplies the link and update stamp.</param>
/// <param name="source">The source the item originates from; its name is the fallback title.</param>
/// <returns>If indexing is required: a task with the corresponding indexing job.
/// <para>Otherwise (blacklist hit, or the indexer returned no job): a completed task.</para></returns>
public Task NotifyNewRssFeedFoundThreadSave(SyndicationItem item, RssKey rssKey, Source source)
{
    var plainText = item.Summary?.Text?.GetTextFromHtml() ?? string.Empty;

    // A title object may exist while its Text is null; guard both levels (as the
    // summary does) so the source name is used as fallback instead of passing null on.
    var plainTitle = item.Title?.Text?.GetTextFromHtml() ?? source.Name;

    // Both the body and the title have to pass the blacklist check.
    if (!DoesNotViolateBlackList(source, plainText) || !DoesNotViolateBlackList(source, plainTitle))
    {
        return Task.CompletedTask;
    }

    var article = new Article(
        Guid.NewGuid(),
        plainTitle,
        rssKey.Link,
        null,
        plainText,
        new DateTime(rssKey.Updated),
        DateTime.UtcNow,
        source.Id);

    var job = DatabaseAdapterFactory.GetControllerInstance<IIndexerService>()
        .IndexRssFeedItemThreadSafeAsync(article, rssKey);

    // Never hand a null task to callers that may await or collect it.
    return job ?? Task.CompletedTask;
}
/// <summary>
/// Indexes a freshly generated article through the indexer service and fails
/// if the indexing call throws.
/// </summary>
public void IndexRssItemTest()
{
    var elasticController = DatabaseAdapterFactory.GetControllerInstance<IIndexerService>();

    // The current Unix time in milliseconds makes title, link and key unique per run.
    var milliseconds = DateTimeOffset.Now.ToUnixTimeMilliseconds();
    var guid = Guid.NewGuid();
    var indexedguid = Guid.NewGuid();
    var articleOne = new Article(
        guid,
        "abgef:" + milliseconds,
        "http://" + milliseconds + ".de",
        "http://" + milliseconds + ".de",
        "abgefssmt",
        DateTime.Today,
        DateTime.Today,
        indexedguid);
    var rsskey = new RssKey(milliseconds, articleOne.Link);

    try
    {
        // Block until the indexing job has finished. Without this the test returns
        // while the job is still running and a faulted task is never observed.
        var job = elasticController.IndexRssFeedItemThreadSafeAsync(articleOne, rsskey);
        if (job != null)
        {
            job.GetAwaiter().GetResult();
        }
    }
    catch (Exception e)
    {
        // Surface the actual failure instead of an anonymous Assert.Fail().
        Assert.Fail("Indexing the RSS item threw: " + e);
    }
}
/// <summary>
/// Indexes an RSS article unless an article with the same short key is already known.
/// </summary>
/// <remarks>
/// When the <c>ST</c> symbol is defined the work runs under <c>lock (lockObj)</c> and waits
/// synchronously on the inner tasks; otherwise the inner tasks are awaited.
/// Exceptions raised while indexing are caught and written to the console.
/// </remarks>
/// <param name="article">The article to index.</param>
/// <param name="key">The RSS key; its update stamp and link form the duplicate-detection key
/// together with the article's indexing source id.</param>
/// <returns>A task that completes once the article has been handled.</returns>
public async Task IndexRssFeedItemThreadSafeAsync(Article article, RssKey key)
{
#if ST
    lock (lockObj)
#endif
    {
        try
        {
            var elastic = new ElasticClient(settings);
            var lookupKey = new ShortArticleKey(article.IndexingSourceId, key.Updated, key.Link);

#if ST
            var alreadyIndexed = IsArticleDuplicate(lookupKey, elastic).Result;
#else
            var alreadyIndexed = await IsArticleDuplicate(lookupKey, elastic);
#endif
            if (alreadyIndexed)
            {
                return;
            }

#if ST
            var outcome = IndexArticle(article, lookupKey, elastic).Result;
#else
            var outcome = await IndexArticle(article, lookupKey, elastic);
#endif
            if (outcome.Status == SuccessState.UnknownError)
            {
                // Caught by the handler below, which reports it on the console.
                throw new Exception("IndexArticle failed: " + outcome.UserMessage);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex);
        }
    }
}
/// <summary>
/// Handles a given document by applying the source's XPath and/or the configured regex
/// and notifying the observer about the resulting content.
/// </summary>
/// <param name="doc">The document to handle. When null, nothing is processed.</param>
/// <param name="title">The title (optional). When null, it is resolved via <c>GetTitle(doc)</c>.</param>
/// <param name="rssKey">The RSS key (optional). When given, its link and update stamp are used;
/// otherwise the source URI and the current UTC time are used.</param>
/// <returns>The notification tasks that were started; empty when the document is null,
/// unchanged since the last call, or the XPath matches no node.</returns>
internal IEnumerable<Task> HandleDocument(HtmlDocument doc, string title = null, RssKey rssKey = null)
{
    // Same tolerance for a missing document as the HandleImages overloads.
    if (doc == null)
    {
        return new Task[0];
    }

    // Skip documents whose raw text is identical to the previously handled one.
    if (string.Equals(doc.Text, this.lastFetchedContent, StringComparison.Ordinal))
    {
        return new Task[0];
    }

    this.lastFetchedContent = doc.Text;

    var link = rssKey != null ? rssKey.Link : Source.Uri;
    var published = rssKey != null ? new DateTime(rssKey.Updated) : DateTime.UtcNow;
    var noXPath = string.IsNullOrEmpty(Source.XPath);
    var noRegex = this.regex == null;
    var jobList = new List<Task>();

    // First possibility (makes nearly no sense): no filter -> index the whole document directly.
    if (noRegex && noXPath)
    {
        title = title ?? GetTitle(doc);
        jobList.Add(Observer.NotifyNewWebPageContentFoundThreadSafe(title, doc.GetTextFromHtml(), link, published, Source));
        jobList.AddRange(HandleImages(doc, title, rssKey));
        return jobList;
    }

    // Regex only: run it over the whole document text.
    if (noXPath)
    {
        title = title ?? GetTitle(doc);
        jobList.AddRange(HandleRegex(doc.Text, title, link, published));
        return jobList;
    }

    // XPath (optionally combined with the regex): handle each selected node on its own.
    var relevantNodes = doc.DocumentNode.SelectNodes(Source.XPath);
    if (relevantNodes == null)
    {
        // HtmlAgilityPack returns null rather than an empty collection when nothing matches.
        return jobList;
    }

    foreach (var node in relevantNodes)
    {
        title = title ?? GetTitle(doc);
        if (noRegex)
        {
            jobList.Add(Observer.NotifyNewWebPageContentFoundThreadSafe(title, node.InnerHtml.GetTextFromHtml(), link, published, Source));
            jobList.AddRange(HandleImages(node.InnerHtml, title, rssKey));
        }
        else
        {
            jobList.AddRange(HandleRegex(node.InnerHtml, title, link, published));
        }
    }

    return jobList;
}