コード例 #1
0
ファイル: ExtractHelper.cs プロジェクト: 0xf836/ripper
        /// <summary>
        /// Extracts links leading to other threads and posts for indices crawling.
        /// </summary>
        /// <remarks>
        /// TODO : Change to regex.
        /// Only anchors whose <c>href</c> value is immediately followed by
        /// <c>" target="_blank"&gt;</c> and whose URL contains <c>showthread.php</c> or
        /// <c>showpost.php</c> are returned. Everything from the first <c>&amp;amp;</c>
        /// onwards is cut off the returned URL.
        /// </remarks>
        /// <param name="htmlDump">
        /// The HTML to scan. A null or empty value yields an empty list.
        /// </param>
        /// <param name="url">
        /// The URL the HTML was loaded from. When it starts with <c>http://</c> and contains
        /// <c>#post</c>, the text after <c>#post</c> is used as a post id and only the content
        /// between <c>&lt;div id="post_message_{id}"&gt;</c> and the next <c>&lt;/blockquote&gt;</c>
        /// is scanned. If that block cannot be located an empty list is returned.
        /// </param>
        /// <returns>
        /// One <see cref="ImageInfo"/> per matching link, with the link in <c>ImageUrl</c> and an
        /// empty <c>ThumbnailUrl</c>. Never null.
        /// </returns>
        public static List<ImageInfo> ExtractIndexUrlsHtml(string htmlDump, string url)
        {
            List<ImageInfo> rtnList = new List<ImageInfo>();

            if (string.IsNullOrEmpty(htmlDump))
            {
                return rtnList;
            }

            const string StartHref = "<a ";
            const string Href = "href=\"";
            const string EndHref = "</a>";
            const string TargetBlank = "\" target=\"_blank\">";
            const string PostAnchor = "#post";
            const string EscapedAmpersand = "&amp;";

            // use only message content
            if (!string.IsNullOrEmpty(url) && url.StartsWith("http://", StringComparison.Ordinal))
            {
                int postAnchorIndex = url.IndexOf(PostAnchor, StringComparison.Ordinal);

                if (postAnchorIndex >= 0)
                {
                    const string MessageEnd = "</blockquote>";

                    string postId = url.Substring(postAnchorIndex + PostAnchor.Length);
                    string messageStart = string.Format("<div id=\"post_message_{0}\">", postId);

                    int messageStartIndex = htmlDump.IndexOf(messageStart, StringComparison.Ordinal);

                    if (messageStartIndex < 0)
                    {
                        // The requested post is not part of this page; do not fall back to
                        // harvesting links that belong to other posts.
                        return rtnList;
                    }

                    int contentStart = messageStartIndex + messageStart.Length;
                    int contentEnd = htmlDump.IndexOf(MessageEnd, contentStart, StringComparison.Ordinal);

                    if (contentEnd < 0)
                    {
                        return rtnList;
                    }

                    htmlDump = htmlDump.Substring(contentStart, contentEnd - contentStart);
                }
            }

            // Walk the document with a moving index instead of cutting every processed
            // anchor out of a copy of the whole string.
            int anchorStart = htmlDump.IndexOf(StartHref, StringComparison.Ordinal);

            while (anchorStart >= 0)
            {
                int hrefIndex = htmlDump.IndexOf(Href, anchorStart, StringComparison.Ordinal);

                if (hrefIndex < 0)
                {
                    // No href anywhere behind this point, so no later anchor can have one either.
                    break;
                }

                int hrefValueStart = hrefIndex + Href.Length;
                int anchorEnd = htmlDump.IndexOf(EndHref, hrefValueStart, StringComparison.Ordinal);

                if (anchorEnd < 0)
                {
                    // Unterminated anchor: nothing further can be extracted. Stopping here
                    // prevents re-finding the same "<a " forever.
                    break;
                }

                // Everything between href=" and </a>: the URL, the remaining attributes and the link text.
                string anchorRemainder = htmlDump.Substring(hrefValueStart, anchorEnd - hrefValueStart);

                int urlLength = anchorRemainder.IndexOf(TargetBlank, StringComparison.Ordinal);

                if (urlLength >= 0)
                {
                    string linkUrl = anchorRemainder.Substring(0, urlLength);

                    if (linkUrl.Contains("showthread.php") || linkUrl.Contains("showpost.php"))
                    {
                        // Keep only the part in front of the first HTML-escaped ampersand.
                        int ampersandIndex = linkUrl.IndexOf(EscapedAmpersand, StringComparison.Ordinal);

                        if (ampersandIndex >= 0)
                        {
                            linkUrl = linkUrl.Remove(ampersandIndex);
                        }

                        rtnList.Add(new ImageInfo { ThumbnailUrl = string.Empty, ImageUrl = linkUrl });
                    }
                }

                anchorStart = htmlDump.IndexOf(StartHref, anchorEnd + EndHref.Length, StringComparison.Ordinal);
            }

            return rtnList;
        }
コード例 #2
0
ファイル: ExtractHelper.cs プロジェクト: 0xf836/ripper
        /// <summary>
        /// Gets the post ids of all posts found in a thread page.
        /// </summary>
        /// <remarks>
        /// TODO : Change to Regex.
        /// A post id is the text between <c>&lt;a name="post</c> and the next <c>" href</c>.
        /// Candidates that are empty or rejected by <c>Utility.IsNumeric</c> are skipped.
        /// </remarks>
        /// <param name="htmlDump">
        /// The HTML of the thread page. A null or empty value yields an empty list.
        /// </param>
        /// <returns>
        /// One <see cref="ImageInfo"/> per post, with the post id stored in <c>ImageUrl</c>.
        /// Never null.
        /// </returns>
        public static List<ImageInfo> ExtractThreadtoPostsHtml(string htmlDump)
        {
            List<ImageInfo> rtnList = new List<ImageInfo>();

            if (string.IsNullOrEmpty(htmlDump))
            {
                return rtnList;
            }

            const string Start = "<a name=\"post";
            const string End = "\" href";

            int markerIndex = htmlDump.IndexOf(Start, StringComparison.Ordinal);

            while (markerIndex >= 0)
            {
                int idStart = markerIndex + Start.Length;
                int idEnd = htmlDump.IndexOf(End, idStart, StringComparison.Ordinal);

                if (idEnd < 0)
                {
                    // No terminator behind this marker, so none behind any later marker either.
                    break;
                }

                string postId = htmlDump.Substring(idStart, idEnd - idStart);

                if (!string.IsNullOrEmpty(postId) && Utility.IsNumeric(postId))
                {
                    rtnList.Add(new ImageInfo { ImageUrl = postId });
                }

                // Resume right behind the marker that was just handled.
                markerIndex = htmlDump.IndexOf(Start, idStart, StringComparison.Ordinal);
            }

            return rtnList;
        }
コード例 #3
0
ファイル: ExtractHelper.cs プロジェクト: 0xf836/ripper
        /// <summary>
        /// Attempts to extract hot linked and thumb-&gt;FullScale images.
        /// </summary>
        /// <remarks>
        /// The input is read as XML into a <see cref="DataSet"/> and every row of its
        /// <c>Image</c> table is turned into an <see cref="ImageInfo"/>. For each image URL,
        /// double quotes are removed, URLs rejected by <c>Utility.IsImageNoneSense</c> are
        /// skipped, <c>Utility.ReplaceHexWithAscii</c> is applied, the anonym.to and forum
        /// redirect prefixes are stripped, and <c>/out/out.php?x=</c> links are resolved with a
        /// web request. Duplicate URLs are returned only once.
        /// Any exception is shown in a message box; the images collected up to that point are
        /// still returned.
        /// </remarks>
        /// <param name="strDump">
        /// The XML to read. A null or empty value yields an empty list without any message.
        /// </param>
        /// <returns>
        /// The extracted images. Never null; empty when the XML has no <c>Image</c> element.
        /// </returns>
        public static List<ImageInfo> ExtractImages(string strDump)
        {
            List<ImageInfo> rtnList = new List<ImageInfo>();

            if (string.IsNullOrEmpty(strDump))
            {
                return rtnList;
            }

            HashSet<string> seenUrls = new HashSet<string>();

            try
            {
                using (StringReader reader = new StringReader(strDump))
                using (DataSet ds = new DataSet())
                {
                    ds.ReadXml(reader);

                    // The indexer yields null when the XML contains no "Image" element.
                    DataTable imageTable = ds.Tables["Image"];

                    if (imageTable == null)
                    {
                        return rtnList;
                    }

                    // thumb_url is optional; check the schema once instead of catching per row.
                    bool hasThumbUrl = imageTable.Columns.Contains("thumb_url");

                    foreach (DataRow row in imageTable.Rows)
                    {
                        string thumbUrl = hasThumbUrl ? row["thumb_url"].ToString() : string.Empty;

                        string imageUrl = row["main_url"].ToString().Replace("\"", string.Empty);

                        if (Utility.IsImageNoneSense(imageUrl))
                        {
                            continue;
                        }

                        imageUrl = Utility.ReplaceHexWithAscii(imageUrl);

                        // Remove anonym.to from Link if exists
                        if (imageUrl.Contains("anonym.to"))
                        {
                            imageUrl = imageUrl.Replace("http://www.anonym.to/?", string.Empty);
                        }

                        // Remove redirect
                        if (imageUrl.Contains("redirect-to"))
                        {
                            string redirectPrefix = string.Format(
                                "{0}redirect-to/?redirect=",
                                CacheController.Instance().UserSettings.ForumURL);

                            imageUrl = imageUrl.Replace(redirectPrefix, string.Empty);
                        }

                        // Get Real Url
                        if (imageUrl.Contains("/out/out.php?x="))
                        {
                            var req = (HttpWebRequest)WebRequest.Create(imageUrl);

                            req.Referer = imageUrl;
                            req.Timeout = 20000;

                            // Dispose the response even if reading it fails.
                            using (var res = (HttpWebResponse)req.GetResponse())
                            {
                                imageUrl = res.ResponseUri.ToString();
                            }
                        }

                        // Add returns false for a URL that was already collected.
                        if (!seenUrls.Add(imageUrl))
                        {
                            continue;
                        }

                        rtnList.Add(new ImageInfo { ImageUrl = imageUrl, ThumbnailUrl = thumbUrl });
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(string.Format("{0}\n{1}", ex.Message, ex.StackTrace));
            }

            return rtnList;
        }