/// <summary>
/// Derives a candidate title and a body from the post's HTML text and stores them on
/// <paramref name="post"/>.
/// </summary>
/// <remarks>
/// <para>
/// If the text contains a <c>&lt;br</c> tag, everything before it (with HTML tags removed) becomes
/// the title. For <see cref="TelegramPostType.Text"/> posts the body is whatever follows that tag;
/// for every other post type the body is the complete text.
/// </para>
/// <para>
/// If there is no line break, the tag-stripped text is the title. When it exceeds
/// <c>MAX_TITLE_LENGTH</c> it is truncated to that length and the body is the complete text;
/// otherwise the body is the complete text for text posts and an empty string for all other types.
/// </para>
/// </remarks>
/// <param name="post">Post to populate. Nothing happens when it is <c>null</c>.</param>
/// <param name="txt">
/// HTML of the message text. <c>null</c> is treated exactly like an empty string.
/// </param>
private static void SetTitleBody(TelegramPost post, string txt)
{
    if (post == null)
    {
        return;
    }

    // The caller hands over null when the document has no message text node;
    // normalise it so the string operations below cannot throw.
    string html = txt ?? string.Empty;

    int brIndex = html.IndexOf("<br", StringComparison.Ordinal);
    if (brIndex > -1)
    {
        post.PossibleTitle = html.Substring(0, brIndex).RemoveHtmlTags();

        if (post.PostType == TelegramPostType.Text)
        {
            // The tag can be written as <br>, <br/> or <br />, so skip to its closing '>'
            // rather than assuming a fixed width of four characters.
            int tagEnd = html.IndexOf('>', brIndex);

            // An unterminated tag at the very end leaves nothing usable for the body.
            post.Body = tagEnd < 0 ? string.Empty : html.Substring(tagEnd + 1);
        }
        else
        {
            post.Body = html;
        }

        return;
    }

    var possibleTitle = html.RemoveHtmlTags();
    if (possibleTitle.Length > MAX_TITLE_LENGTH)
    {
        // Too long for a title: keep the leading part as title and the whole text as body.
        post.PossibleTitle = possibleTitle.Substring(0, MAX_TITLE_LENGTH);
        post.Body = html;
    }
    else
    {
        post.PossibleTitle = possibleTitle;
        post.Body = post.PostType == TelegramPostType.Text ? html : "";
    }
}
/// <summary>
/// Resolves the attachment URL that matches the post's type and, when it is a valid absolute
/// URI, stores it in <c>post.AttachmentUri</c>.
/// </summary>
/// <remarks>
/// Photo posts delegate to <c>ParsePhotoUrl</c>, sticker posts to <c>ParseStickerUrl</c>, and video
/// posts read the <c>src</c> attribute of the video node. Any other post type yields no URL.
/// <c>post.AttachmentUri</c> is left untouched when the URL is missing, blank, or not absolute.
/// </remarks>
/// <param name="post">Post whose <c>PostType</c> selects the source node and which receives the URI.</param>
/// <param name="photoNode">Node used for photo posts.</param>
/// <param name="videoNode">Node used for video posts; may be <c>null</c>.</param>
/// <param name="stickerNode">Node used for sticker posts.</param>
private void SetAttachmentUri(TelegramPost post, HtmlNode photoNode, HtmlNode videoNode, HtmlNode stickerNode)
{
    var kind = post.PostType;
    string rawUrl;

    if (kind == TelegramPostType.Photo)
    {
        rawUrl = ParsePhotoUrl(photoNode);
    }
    else if (kind == TelegramPostType.Video)
    {
        rawUrl = videoNode?.Attributes["src"]?.Value;
    }
    else if (kind == TelegramPostType.Sticker)
    {
        rawUrl = ParseStickerUrl(stickerNode);
    }
    else
    {
        rawUrl = null;
    }

    // Nothing to record without a usable URL.
    if (string.IsNullOrWhiteSpace(rawUrl))
    {
        return;
    }

    if (Uri.TryCreate(rawUrl, UriKind.Absolute, out var attachment))
    {
        post.AttachmentUri = attachment;
    }
}
/// <summary>
/// Builds a <see cref="TelegramPost"/> from a parsed HTML document by looking up the
/// <c>tgme_widget_message_*</c> elements it contains.
/// </summary>
/// <param name="htmlDocument">Parsed HTML document; may be <c>null</c>.</param>
/// <returns>
/// The populated post, or <c>null</c> when <paramref name="htmlDocument"/> is <c>null</c> or the
/// document contains none of the recognised content nodes (message text, video, photo or link
/// preview image, sticker, document).
/// </returns>
/// <exception cref="InvalidOperationException">
/// The document contains more than one <c>tgme_widget_message_meta</c> span, or that span contains
/// more than one <c>tgme_widget_message_date</c> link (raised by <c>SingleOrDefault</c>).
/// </exception>
protected TelegramPost Parse(HtmlDocument htmlDocument)
{
    if (htmlDocument == null)
    {
        return null;
    }

    var root = htmlDocument.DocumentNode;

    var divText = root.Descendants("div")
        .LastOrDefault(m => m.HasClass("tgme_widget_message_text"));
    var video = root.Descendants("video")?
        .LastOrDefault();

    // A photo wrapper link wins; a link preview image is only used when no such link exists.
    var aPhoto = root.Descendants("a")
                     .FirstOrDefault(m => m.HasClass("tgme_widget_message_photo_wrap"))
                 ?? root.Descendants("i")
                     .FirstOrDefault(m => m.HasClass("link_preview_image"));
    var iSticker = root.Descendants("i")
        .FirstOrDefault(m => m.HasClass("tgme_widget_message_sticker"));
    var aDocument = root.Descendants("a")
        .FirstOrDefault(m => m.HasClass("tgme_widget_message_document_wrap"));

    if (aPhoto == null && divText == null && video == null && iSticker == null && aDocument == null)
    {
        // No text and no recognised attachment: there is nothing to build a post from.
        return null;
    }

    var spans = root.Descendants("span")?.ToList();
    var authorNode = spans?.LastOrDefault(m => m.HasClass("tgme_widget_message_from_author"));

    // SingleOrDefault is intentional in the original code: it throws when the meta span or the
    // date link is not unique instead of silently picking one of them.
    var dateNode = spans?
        .SingleOrDefault(s => s.HasClass("tgme_widget_message_meta"))?.Descendants("a")?
        .SingleOrDefault(s => s.HasClass("tgme_widget_message_date"))?.Descendants("time")?
        .FirstOrDefault();
    var viewNode = spans?.LastOrDefault(m => m.HasClass("tgme_widget_message_views"));

    var post = new TelegramPost
    {
        Id = ParsePostId(htmlDocument),
        // Type precedence: video, then photo, then sticker, then document; plain text otherwise.
        PostType = video != null
            ? TelegramPostType.Video
            : aPhoto != null
                ? TelegramPostType.Photo
                : iSticker != null
                    ? TelegramPostType.Sticker
                    : aDocument != null
                        ? TelegramPostType.File
                        : TelegramPostType.Text,
        WebRaw = htmlDocument.ParsedText,
        TextRaw = divText?.InnerText,
        Author = authorNode?.InnerText ?? string.Empty,
        DateString = dateNode?.InnerText,
        ViewCount = viewNode?.InnerText
    };

    SetAttachmentUri(post, aPhoto, video, iSticker);

    // Posts without a text node have no inner HTML; pass an empty string so that
    // title/body extraction never receives null.
    SetTitleBody(post, divText?.InnerHtml ?? string.Empty);

    // The datetime attribute is markup meant for machines, so parse it with the invariant
    // culture instead of whatever culture the current thread happens to run under.
    string dateAttribute = dateNode?.Attributes["datetime"]?.Value;
    if (DateTime.TryParse(
            dateAttribute,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out var date))
    {
        post.Date = date;
    }

    return post;
}