/// <summary>
/// Extracts the title of the current post, if there is one;
/// otherwise the post id is used as the title.
/// </summary>
/// <param name="content">
/// The HTML content of the page.
/// </param>
/// <param name="url">
/// The post url; everything after the first "p=" is taken as the post id.
/// </param>
/// <returns>
/// The post title; "post# {id}" when the title is blank or is just "Re: {topic title}";
/// or an empty string when neither title pattern matches the content.
/// </returns>
public string ExtractPostTitleFromHtml(string content, string url)
{
    var postId = url.Substring(url.IndexOf("p=", StringComparison.Ordinal) + 2);

    // The id is taken verbatim from the url, so escape it before it becomes part of a pattern.
    var escapedPostId = Regex.Escape(postId);

    var check =
        string.Format(
            @"<h2 class=\""title icon\"">\r\n\t\t\t\t\t(?<inner>[^\r]*)\r\n\t\t\t\t</h2>\r\n\t\t\t\t\r\n\r\n\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t<div class=\""content\"">\r\n\t\t\t\t\t<div id=\""post_message_{0}\"">",
            escapedPostId);

    var check2 =
        string.Format(
            @"<h2 class=\""title icon\"">\r\n\t\t\t\t\t(?<inner>[^\r]*)\r\n\t\t\t\t</h2>\r\n\t\t\t\t\r\n\r\n\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t<div class=\""content\"">\r\n\t\t\t\t\t<div id=\""post_message_{0}\"">",
            escapedPostId);

    // The patterns differ per post id, so RegexOptions.Compiled would only add
    // compilation cost on every call without any reuse.
    var match = Regex.Match(content, check);

    if (!match.Success)
    {
        match = Regex.Match(content, check2);

        if (!match.Success)
        {
            return string.Empty;
        }
    }

    var postTitle = match.Groups["inner"].Value.Trim();

    // A blank title, or one that merely repeats the topic title, is replaced by the post id.
    if (postTitle == string.Empty
        || postTitle == string.Format("Re: {0}", this.ExtractTopicTitleFromHtml(content)))
    {
        postTitle = string.Format("post# {0}", postId);
    }

    // Remove Topic Icons if found
    if (postTitle.Contains("<img"))
    {
        var iconEnd = postTitle.IndexOf(" /> ", StringComparison.Ordinal);

        // Only cut when the closing of the icon tag is really there; otherwise keep the title intact.
        if (iconEnd >= 0)
        {
            postTitle = postTitle.Substring(iconEnd + 4);
        }
    }

    return Utility.ReplaceHexWithAscii(postTitle);
}
/// <summary>
/// Attempts to extract hot linked and thumb->FullScale images.
/// </summary>
/// <param name="htmlDump">
/// The html Dump.
/// </param>
/// <param name="postId">
/// The Post Id, or a url containing a "#post{id}" fragment from which the id is taken.
/// </param>
/// <returns>
/// The images found inside the post message; an empty list when the
/// post message container is not present in the dump.
/// </returns>
public static List<ImageInfo> ExtractImagesLinksHtml(string htmlDump, string postId)
{
    const string PostAnchor = "#post";
    const string MessageEnd = "</blockquote>";

    if (!string.IsNullOrEmpty(postId))
    {
        var anchorIndex = postId.IndexOf(PostAnchor, System.StringComparison.Ordinal);

        // Only strip when the fragment is really there (works for http and https urls alike).
        if (anchorIndex >= 0)
        {
            postId = postId.Substring(anchorIndex + PostAnchor.Length);
        }
    }

    // Decode the ampersand entity so urls taken from attributes are usable.
    // NOTE(review): the previous call replaced "&" with "&" (a no-op); decoding "&amp;" is the assumed intent - confirm.
    htmlDump = htmlDump.Replace("&amp;", "&");

    // use only message content
    var messageStart = string.Format("<div id=\"post_message_{0}\">", postId);

    var startIndex = htmlDump.IndexOf(messageStart, System.StringComparison.Ordinal);

    if (startIndex < 0)
    {
        // No message container for this post: nothing to extract.
        return new List<ImageInfo>();
    }

    startIndex += messageStart.Length;

    var endIndex = htmlDump.IndexOf(MessageEnd, startIndex, System.StringComparison.Ordinal);

    // When the closing tag is missing, fall back to everything after the start marker.
    htmlDump = endIndex < 0
                   ? htmlDump.Substring(startIndex)
                   : htmlDump.Substring(startIndex, endIndex - startIndex);

    // Parse all links; entries rejected by Utility.IsImageNoneSense are dropped.
    var rtnList =
        LinkFinder.ListAllLinks(htmlDump)
            .Select(
                link =>
                new ImageInfo
                    {
                        ImageUrl = RemoveRedirectLink(Utility.ReplaceHexWithAscii(link.Href)),
                        ThumbnailUrl = Utility.ReplaceHexWithAscii(link.Text)
                    })
            .Where(
                item =>
                !Utility.IsImageNoneSense(item.ImageUrl) && !Utility.IsImageNoneSense(item.ThumbnailUrl))
            .ToList();

    // Parse all images; these are added without the filter above.
    rtnList.AddRange(
        LinkFinder.ListAllImages(htmlDump)
            .Select(
                link =>
                new ImageInfo
                    {
                        ImageUrl = RemoveRedirectLink(Utility.ReplaceHexWithAscii(link.Href)),
                        ThumbnailUrl = Utility.ReplaceHexWithAscii(link.Text)
                    }));

    return rtnList;
}
/// <summary>
/// Extracts links leading to other threads and posts for indices crawling.
/// </summary>
/// <param name="xmlDump">
/// The XML dump; rows are read from its "Image" table ("main_url" and optional "thumb_url" columns).
/// </param>
/// <returns>
/// One entry per distinct main url, in document order; an empty list when the dump has no "Image" table.
/// </returns>
public static List<ImageInfo> ExtractRiPUrls(string xmlDump)
{
    var rtnList = new List<ImageInfo>();

    // Tracks the urls already returned so duplicates are skipped.
    var seenUrls = new HashSet<string>();

    try
    {
        using (var reader = new StringReader(xmlDump))
        using (var ds = new DataSet())
        {
            ds.ReadXml(reader);

            var imageTable = ds.Tables["Image"];

            if (imageTable == null)
            {
                // No images in this dump; that is not an error.
                return rtnList;
            }

            // A DataRow indexer never returns null (missing values are DBNull) and throws for an
            // unknown column, so the presence of the optional column has to be checked on the table.
            var hasThumbColumn = imageTable.Columns.Contains("thumb_url");

            foreach (DataRow row in imageTable.Rows)
            {
                var imageUrl = Utility.ReplaceHexWithAscii(row["main_url"].ToString());

                if (!seenUrls.Add(imageUrl))
                {
                    continue;
                }

                rtnList.Add(
                    new ImageInfo
                        {
                            ImageUrl = imageUrl,
                            ThumbnailUrl = hasThumbColumn ? row["thumb_url"].ToString() : string.Empty
                        });
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show($"{ex.Message}\n{ex.StackTrace}");
    }

    return rtnList;
}
/// <summary>
/// Extracts the thread to posts.
/// </summary>
/// <param name="xmlDump">
/// The XML dump; post ids are read from the "id" column of its "post" table.
/// </param>
/// <returns>
/// One entry per distinct post id (stored in <c>ImageUrl</c>), in document order.
/// </returns>
public static List<ImageInfo> ExtractThreadtoPosts(string xmlDump)
{
    var rtnList = new List<ImageInfo>();

    // Tracks the ids already returned so duplicates are skipped.
    var seenIds = new HashSet<string>();

    try
    {
        using (var reader = new StringReader(xmlDump))
        using (var ds = new DataSet())
        {
            ds.ReadXml(reader);

            // A missing "post" table or "id" column throws here and is reported by the catch below.
            foreach (DataRow row in ds.Tables["post"].Rows)
            {
                // Missing values come back as DBNull, never as null, so ask the row directly.
                if (row.IsNull("id"))
                {
                    continue;
                }

                var postId = Utility.ReplaceHexWithAscii(row["id"].ToString());

                if (!seenIds.Add(postId))
                {
                    continue;
                }

                rtnList.Add(new ImageInfo { ImageUrl = postId });
            }
        }
    }
    catch (Exception ex)
    {
        Utility.SaveOnCrash(xmlDump, ex.StackTrace, null);
    }

    return rtnList;
}
/// <summary>
/// Downloads the image referenced by <c>ImageLinkURL</c> into <c>SavePath</c>:
/// registers a cache entry in <c>EventTable</c>, fetches the image host page,
/// reads the full image url from the "thepic" image tag and saves that file.
/// </summary>
/// <returns>
/// <c>true</c> when the url is already registered, when the download succeeded, or when the
/// download was interrupted by a thread abort or an I/O error; <c>false</c> when the save
/// directory could not be created, the entry could not be registered, the host page was
/// too short or did not contain the image tag, or the download failed with a web error.
/// </returns>
protected override bool DoDownload()
{
    const string ImageMarker = "<img id=\"thepic\" name=\"thisimage\" src=\"";

    var strImgURL = this.ImageLinkURL;

    if (this.EventTable.ContainsKey(strImgURL))
    {
        return true;
    }

    try
    {
        if (!Directory.Exists(this.SavePath))
        {
            Directory.CreateDirectory(this.SavePath);
        }
    }
    catch (IOException)
    {
        return false;
    }

    // File name = last url segment without the ".html" suffix, cleaned of illegal characters.
    var strFilePath = strImgURL.Substring(strImgURL.LastIndexOf("/", StringComparison.Ordinal) + 1)
        .Replace(".html", string.Empty);

    strFilePath = Path.Combine(this.SavePath, Utility.RemoveIllegalCharecters(strFilePath));

    var cacheObject = new CacheObject { IsDownloaded = false, FilePath = strFilePath, Url = strImgURL };

    try
    {
        this.EventTable.Add(strImgURL, cacheObject);
    }
    catch (ThreadAbortException)
    {
        return true;
    }
    catch (Exception)
    {
        // The add failed: give up if the key is already registered, otherwise retry once.
        if (this.EventTable.ContainsKey(strImgURL))
        {
            return false;
        }

        this.EventTable.Add(strImgURL, cacheObject);
    }

    // NOTE: strImgURL is passed by reference and may come back changed, so from here on the
    // cache entry is always looked up by this.ImageLinkURL, the key it was registered under.
    var strIVPage = this.GetImageHostPage(ref strImgURL);

    if (strIVPage.Length < 10)
    {
        return false;
    }

    var iStartIMG = strIVPage.IndexOf(ImageMarker, StringComparison.Ordinal);

    if (iStartIMG < 0)
    {
        return false;
    }

    iStartIMG += ImageMarker.Length;

    var iEndSRC = strIVPage.IndexOf("\" width", iStartIMG, StringComparison.Ordinal);

    if (iEndSRC < 0)
    {
        return false;
    }

    var strNewURL = Utility.ReplaceHexWithAscii(strIVPage.Substring(iStartIMG, iEndSRC - iStartIMG));

    // Utility.GetSuitableName may return a different path; keep the cache entry in sync with it.
    var newAlteredPath = Utility.GetSuitableName(strFilePath);

    if (strFilePath != newAlteredPath)
    {
        strFilePath = newAlteredPath;
        ((CacheObject)this.EventTable[this.ImageLinkURL]).FilePath = strFilePath;
    }

    try
    {
        // using guarantees the client is released even when the download throws.
        using (var client = new WebClient())
        {
            client.Headers.Add("Referer: " + strImgURL);
            client.Headers.Add(
                "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6");
            client.DownloadFile(strNewURL, strFilePath);
        }
    }
    catch (ThreadAbortException)
    {
        ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
        ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

        return true;
    }
    catch (IOException)
    {
        ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
        ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

        return true;
    }
    catch (WebException)
    {
        ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
        ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

        return false;
    }

    ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = true;

    CacheController.Instance().LastPic =
        ((CacheObject)this.EventTable[this.ImageLinkURL]).FilePath = strFilePath;

    return true;
}
/// <summary>
/// Attempts to extract hot linked and thumb->FullScale images.
/// </summary>
/// <param name="htmlDump">The html Dump.</param>
/// <param name="postId">
/// The post identifier, or a url containing a "#post{id}" fragment; only used when
/// the dump has no attachments container.
/// </param>
/// <returns>
/// The links found in the attachments container or, when there is none, in the post
/// message; an empty list when neither container is present.
/// </returns>
public static List<ImageInfo> ExtractAttachmentImagesHtml(string htmlDump, string postId)
{
    const string PostAnchor = "#post";

    var rtnList = new List<ImageInfo>();

    // Decode the ampersand entity so urls taken from attributes are usable.
    // NOTE(review): the previous call replaced "&" with "&" (a no-op); decoding "&amp;" is the assumed intent - confirm.
    htmlDump = htmlDump.Replace("&amp;", "&");

    var start = "<div class=\"attachments\">";
    var end = "<!-- / attachments -->";

    // Prefer the attachments container ...
    var iStart = htmlDump.IndexOf(start, System.StringComparison.Ordinal);

    if (iStart < 0)
    {
        // ... and fall back to the message content of the post.
        if (!string.IsNullOrEmpty(postId))
        {
            // fix post id
            var anchorIndex = postId.IndexOf(PostAnchor, System.StringComparison.Ordinal);

            if (anchorIndex >= 0)
            {
                postId = postId.Substring(anchorIndex + PostAnchor.Length);
            }
        }

        start = string.Format("<div id=\"post_message_{0}\">", postId);
        end = "</blockquote>";

        iStart = htmlDump.IndexOf(start, System.StringComparison.Ordinal);

        if (iStart < 0)
        {
            // Return Empty List
            return rtnList;
        }
    }

    iStart += start.Length;

    // Search for the end marker only after the start marker; when it is missing,
    // use everything that follows the start marker instead of the whole page.
    var iEnd = htmlDump.IndexOf(end, iStart, System.StringComparison.Ordinal);

    htmlDump = iEnd < 0 ? htmlDump.Substring(iStart) : htmlDump.Substring(iStart, iEnd - iStart);

    rtnList.AddRange(
        LinkFinder.ListAllLinks(htmlDump)
            .Select(
                link =>
                new ImageInfo
                    {
                        // Absolute links (http or https) are used as they are;
                        // anything else is prefixed with the current forum url.
                        ImageUrl =
                            link.Href.StartsWith("http://", System.StringComparison.Ordinal)
                            || link.Href.StartsWith("https://", System.StringComparison.Ordinal)
                                ? link.Href
                                : CacheController.Instance().UserSettings.CurrentForumUrl
                                  + Utility.ReplaceHexWithAscii(link.Href),
                        ThumbnailUrl = string.Empty
                    })
            .Where(item => !Utility.IsImageNoneSense(item.ImageUrl)));

    return rtnList;
}
/// <summary>
/// Extracts the title of the current post, if there is one;
/// otherwise the post id is used as the title.
/// </summary>
/// <param name="content">
/// The HTML content of the page.
/// </param>
/// <param name="url">
/// The post url; the text after "#post" is taken as the post id
/// (the whole url when it has no such fragment).
/// </param>
/// <returns>
/// The post title, or "post# {id}" when the post or its title cannot be found or the title is empty.
/// </returns>
public string ExtractPostTitleFromHtml(string content, string url)
{
    const string PostAnchor = "#post";
    const string MessageEnd = "</blockquote>";
    const string TitleStart = "<h2 class=\"title icon\">";
    const string TitleEnd = "</h2>";
    const string IconStart = "<img src=";
    const string IconEnd = "/>";

    var anchorIndex = url.IndexOf(PostAnchor, StringComparison.Ordinal);
    var postId = anchorIndex < 0 ? url : url.Substring(anchorIndex + PostAnchor.Length);

    var postTitle = string.Empty;

    // Extract the current post first: only its container is searched for the title.
    var messageStart = string.Format(
        "<li class=\"postbitlegacy postbitim postcontainer\" id=\"post_{0}\">",
        postId);

    var postStart = content.IndexOf(messageStart, StringComparison.Ordinal);

    if (postStart >= 0)
    {
        postStart += messageStart.Length;

        var postEnd = content.IndexOf(MessageEnd, postStart, StringComparison.Ordinal);

        if (postEnd >= 0)
        {
            var pageContent = content.Substring(postStart, postEnd - postStart);

            var titleStart = pageContent.IndexOf(TitleStart, StringComparison.Ordinal);

            if (titleStart >= 0)
            {
                titleStart += TitleStart.Length;

                var titleEnd = pageContent.IndexOf(TitleEnd, titleStart, StringComparison.Ordinal);

                if (titleEnd >= 0)
                {
                    postTitle = pageContent.Substring(titleStart, titleEnd - titleStart)
                        .Replace("\r", string.Empty)
                        .Replace("\t", string.Empty)
                        .Replace("\n", string.Empty);
                }
            }
        }
    }

    // Remove Post Icon
    if (postTitle.StartsWith(IconStart, StringComparison.Ordinal))
    {
        var iconEnd = postTitle.IndexOf(IconEnd, StringComparison.Ordinal);

        if (iconEnd >= 0)
        {
            // Skip the tag end plus the single character that follows it;
            // nothing is left when the icon is the last thing in the title.
            var textStart = iconEnd + IconEnd.Length + 1;

            postTitle = textStart < postTitle.Length ? postTitle.Substring(textStart) : string.Empty;
        }
    }

    if (string.IsNullOrEmpty(postTitle))
    {
        postTitle = string.Format("post# {0}", postId);
    }

    return Utility.ReplaceHexWithAscii(postTitle);
}
/// <summary>
/// Attempts to extract hot linked and thumb->FullScale images.
/// </summary>
/// <param name="strDump">
/// The XML dump; rows are read from its "Image" table ("main_url" and optional "thumb_url" columns).
/// </param>
/// <returns>
/// One entry per distinct, cleaned-up image url, in document order; an empty list
/// when the dump has no "Image" table.
/// </returns>
public static List<ImageInfo> ExtractImages(string strDump)
{
    var rtnList = new List<ImageInfo>();

    // Tracks the urls already returned so duplicates are skipped.
    var seenUrls = new HashSet<string>();

    try
    {
        using (var reader = new StringReader(strDump))
        using (var ds = new DataSet())
        {
            ds.ReadXml(reader);

            var imageTable = ds.Tables["Image"];

            if (imageTable == null)
            {
                // No images in this dump; that is not an error.
                return rtnList;
            }

            // The thumbnail column is optional; check once instead of catching per row.
            var hasThumbColumn = imageTable.Columns.Contains("thumb_url");

            foreach (DataRow row in imageTable.Rows)
            {
                var thumbUrl = hasThumbColumn ? row["thumb_url"].ToString() : string.Empty;

                // Strip stray double quotes from the url.
                var imageUrl = row["main_url"].ToString().Replace("\"", string.Empty);

                if (Utility.IsImageNoneSense(imageUrl))
                {
                    continue;
                }

                imageUrl = Utility.ReplaceHexWithAscii(imageUrl);

                // Remove anonym.to from Link if exists
                if (imageUrl.Contains("anonym.to"))
                {
                    imageUrl = imageUrl.Replace("http://www.anonym.to/?", string.Empty);
                }

                // Remove redirect
                if (imageUrl.Contains("redirect-to"))
                {
                    imageUrl = imageUrl.Replace(
                        $"{CacheController.Instance().UserSettings.ForumURL}redirect-to/?redirect=",
                        string.Empty);
                }

                // Get Real Url: request the link and take the uri the response ended up at.
                if (imageUrl.Contains("/out/out.php?x="))
                {
                    var req = (HttpWebRequest)WebRequest.Create(imageUrl);

                    req.Referer = imageUrl;
                    req.Timeout = 20000;

                    // using releases the connection even when reading the response fails.
                    using (var res = (HttpWebResponse)req.GetResponse())
                    {
                        imageUrl = res.ResponseUri.ToString();
                    }
                }

                if (!seenUrls.Add(imageUrl))
                {
                    continue;
                }

                rtnList.Add(new ImageInfo { ImageUrl = imageUrl, ThumbnailUrl = thumbUrl });
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show($"{ex.Message}\n{ex.StackTrace}");
    }

    return rtnList;
}