/// <summary>
/// Extracts links leading to other threads and posts for indices crawling.
/// </summary>
/// <remarks>
/// TODO : Change to regex.
/// When <paramref name="url"/> starts with "http://" and contains a "#post" fragment, only the
/// markup between that post's "post_message_" div and the next closing blockquote tag is scanned.
/// A link is collected only when its href value is directly followed by the
/// target="_blank" attribute and the value contains "showthread.php" or "showpost.php";
/// everything from the first ampersand onwards is stripped from the collected URL.
/// </remarks>
/// <param name="htmlDump">
/// The HTML Dump.
/// </param>
/// <param name="url">
/// The Url.
/// </param>
/// <returns>
/// The extracted index links (stored in <c>ImageUrl</c>, with an empty <c>ThumbnailUrl</c>).
/// The list is empty when the dump is null/empty, when the requested post message cannot be
/// located, or when no matching link exists.
/// </returns>
public static List<ImageInfo> ExtractIndexUrlsHtml(string htmlDump, string url)
{
    List<ImageInfo> rtnList = new List<ImageInfo>();

    if (string.IsNullOrEmpty(htmlDump))
    {
        return rtnList;
    }

    const string StartHref = "<a ";
    const string Href = "href=\"";
    const string EndHref = "</a>";
    const string TargetBlank = "\" target=\"_blank\">";

    // use only message content
    if (!string.IsNullOrEmpty(url)
        && url.StartsWith("http://", StringComparison.Ordinal)
        && url.Contains("#post"))
    {
        const string PostAnchor = "#post";
        const string MessageEnd = "</blockquote>";

        string postId = url.Substring(url.IndexOf(PostAnchor, StringComparison.Ordinal) + PostAnchor.Length);
        string messageStart = string.Format("<div id=\"post_message_{0}\">", postId);

        int messageIndex = htmlDump.IndexOf(messageStart, StringComparison.Ordinal);
        if (messageIndex < 0)
        {
            // The requested post is not part of this page; there is nothing to scan.
            return rtnList;
        }

        int contentStart = messageIndex + messageStart.Length;
        int contentEnd = htmlDump.IndexOf(MessageEnd, contentStart, StringComparison.Ordinal);
        if (contentEnd < 0)
        {
            // Truncated markup: the message has no terminator, so its extent is unknown.
            return rtnList;
        }

        htmlDump = htmlDump.Substring(contentStart, contentEnd - contentStart);
    }

    // Walk the anchors with a forward-moving cursor instead of cutting each processed
    // anchor out of a copy of the string and rescanning from the beginning.
    int anchorStart = htmlDump.IndexOf(StartHref, StringComparison.Ordinal);
    while (anchorStart >= 0)
    {
        int hrefStart = htmlDump.IndexOf(Href, anchorStart, StringComparison.Ordinal);
        if (hrefStart < 0)
        {
            // No href attribute remains anywhere after this point.
            break;
        }

        int valueStart = hrefStart + Href.Length;
        int anchorEnd = htmlDump.IndexOf(EndHref, valueStart, StringComparison.Ordinal);
        if (anchorEnd < 0)
        {
            // Unterminated anchor: without this exit the scan would never advance.
            break;
        }

        // Everything from the href value up to the closing tag: url" attributes>link text
        string anchorBody = htmlDump.Substring(valueStart, anchorEnd - valueStart);

        int urlEnd = anchorBody.IndexOf(TargetBlank, StringComparison.Ordinal);
        if (urlEnd >= 0)
        {
            string link = anchorBody.Substring(0, urlEnd);

            if (link.Contains("showthread.php") || link.Contains("showpost.php"))
            {
                // Drop every query parameter after the first one.
                int ampersand = link.IndexOf('&');
                if (ampersand >= 0)
                {
                    link = link.Remove(ampersand);
                }

                rtnList.Add(new ImageInfo { ThumbnailUrl = string.Empty, ImageUrl = link });
            }
        }

        anchorStart = htmlDump.IndexOf(StartHref, anchorEnd + EndHref.Length, StringComparison.Ordinal);
    }

    return rtnList;
}
/// <summary>
/// Get Post ids of all Posts
/// </summary>
/// <remarks>
/// TODO : Change to Regex.
/// A post id is the text between a post name anchor (an "a" tag whose name attribute starts
/// with "post") and the following href attribute. Only ids that are non-empty and accepted
/// by <c>Utility.IsNumeric</c> are returned.
/// </remarks>
/// <param name="htmlDump">
/// The html Dump.
/// </param>
/// <returns>
/// One entry per accepted post id, with the id stored in <c>ImageUrl</c>; empty when the
/// dump is null/empty or contains no post anchors.
/// </returns>
public static List<ImageInfo> ExtractThreadtoPostsHtml(string htmlDump)
{
    List<ImageInfo> rtnList = new List<ImageInfo>();

    if (string.IsNullOrEmpty(htmlDump))
    {
        return rtnList;
    }

    const string Start = "<a name=\"post";
    const string End = "\" href";

    int anchorIndex = htmlDump.IndexOf(Start, StringComparison.Ordinal);

    while (anchorIndex >= 0)
    {
        int idStart = anchorIndex + Start.Length;
        int idEnd = htmlDump.IndexOf(End, idStart, StringComparison.Ordinal);

        if (idEnd < 0)
        {
            // No terminator after this anchor, hence none after any later anchor either.
            break;
        }

        string postId = htmlDump.Substring(idStart, idEnd - idStart);

        if (!string.IsNullOrEmpty(postId) && Utility.IsNumeric(postId))
        {
            rtnList.Add(new ImageInfo { ImageUrl = postId });
        }

        anchorIndex = htmlDump.IndexOf(Start, idStart, StringComparison.Ordinal);
    }

    return rtnList;
}
/// <summary>
/// Attempts to extract hot linked and thumb->FullScale images.
/// </summary>
/// <remarks>
/// The dump is loaded as XML into a <see cref="DataSet"/> and the rows of its "Image" table
/// are read ("main_url" is required, "thumb_url" is optional). Each main URL is cleaned
/// (quotes removed, hex escapes replaced, anonym.to and forum redirect prefixes stripped),
/// "/out/out.php?x=" links are resolved by issuing a web request and taking the final
/// response URI, and duplicates are dropped. Any exception is shown in a message box and the
/// images collected up to that point are returned.
/// </remarks>
/// <param name="strDump">
/// The STR dump.
/// </param>
/// <returns>
/// The extracted images, without duplicates; empty when the dump contains no "Image" table.
/// </returns>
public static List<ImageInfo> ExtractImages(string strDump)
{
    List<ImageInfo> rtnList = new List<ImageInfo>();
    HashSet<string> seenUrls = new HashSet<string>();

    try
    {
        using (DataSet ds = new DataSet())
        {
            using (StringReader reader = new StringReader(strDump))
            {
                ds.ReadXml(reader);
            }

            // The table is absent when the dump holds no image elements; that is an
            // empty result, not an error worth a message box.
            DataTable imageTable = ds.Tables["Image"];
            if (imageTable == null)
            {
                return rtnList;
            }

            bool hasThumbColumn = imageTable.Columns.Contains("thumb_url");

            foreach (DataRow row in imageTable.Rows)
            {
                string thumbUrl = hasThumbColumn ? row["thumb_url"].ToString() : string.Empty;
                string imageUrl = row["main_url"].ToString().Replace("\"", string.Empty);

                if (Utility.IsImageNoneSense(imageUrl))
                {
                    continue;
                }

                imageUrl = Utility.ReplaceHexWithAscii(imageUrl);

                // Remove anonym.to from Link if exists
                if (imageUrl.Contains("anonym.to"))
                {
                    imageUrl = imageUrl.Replace("http://www.anonym.to/?", string.Empty);
                }

                // Remove redirect
                if (imageUrl.Contains("redirect-to"))
                {
                    imageUrl = imageUrl.Replace(
                        string.Format(
                            "{0}redirect-to/?redirect=",
                            CacheController.Instance().UserSettings.ForumURL),
                        string.Empty);
                }

                // Get Real Url
                if (imageUrl.Contains("/out/out.php?x="))
                {
                    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(imageUrl);
                    req.Referer = imageUrl;
                    req.Timeout = 20000;

                    // Dispose the response even if reading the final URI fails.
                    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
                    {
                        imageUrl = res.ResponseUri.ToString();
                    }
                }

                // Add returns false when the URL was already collected.
                if (!seenUrls.Add(imageUrl))
                {
                    continue;
                }

                rtnList.Add(new ImageInfo { ImageUrl = imageUrl, ThumbnailUrl = thumbUrl });
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(string.Format("{0}\n{1}", ex.Message, ex.StackTrace));
    }

    return rtnList;
}