Пример #1
0
        /// <summary>
        /// Extracts the title of the post referenced by <paramref name="url"/> from the page HTML.
        /// </summary>
        /// <param name="content">
        /// The HTML of the page that contains the post.
        /// </param>
        /// <param name="url">
        /// The post url. Everything after the first "p=" is used as the post id;
        /// when "p=" is absent the whole url is used.
        /// </param>
        /// <returns>
        /// An empty string when no title header is found directly in front of the post's
        /// message block. "post# {id}" when the header is empty, contains nothing but an icon,
        /// or equals "Re: {topic title}". Otherwise the header text with a leading icon tag
        /// removed. Non-empty results are passed through <c>Utility.ReplaceHexWithAscii</c>.
        /// </returns>
        public string ExtractPostTitleFromHtml(string content, string url)
        {
            var idMarker = url.IndexOf("p=", StringComparison.Ordinal);
            var postId = idMarker >= 0 ? url.Substring(idMarker + 2) : url;

            // The id is spliced into a regular expression, so escape it to keep
            // regex meta characters coming from the url out of the pattern.
            var escapedPostId = Regex.Escape(postId);

            // Two known page layouts; they differ only in the amount of whitespace
            // between the title header and the content div.
            var check =
                string.Format(
                    @"<h2 class=\""title icon\"">\r\n\t\t\t\t\t(?<inner>[^\r]*)\r\n\t\t\t\t</h2>\r\n\t\t\t\t\r\n\r\n\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t<div class=\""content\"">\r\n\t\t\t\t\t<div id=\""post_message_{0}\"">",
                    escapedPostId);

            // The patterns are rebuilt for every post id, so RegexOptions.Compiled
            // would only add compilation cost without any reuse.
            var match = Regex.Match(content, check);

            if (!match.Success)
            {
                var check2 =
                    string.Format(
                        @"<h2 class=\""title icon\"">\r\n\t\t\t\t\t(?<inner>[^\r]*)\r\n\t\t\t\t</h2>\r\n\t\t\t\t\r\n\r\n\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\r\n\t\t\t\t<div class=\""content\"">\r\n\t\t\t\t\t<div id=\""post_message_{0}\"">",
                        escapedPostId);

                match = Regex.Match(content, check2);

                if (!match.Success)
                {
                    return string.Empty;
                }
            }

            var fallbackTitle = string.Format("post# {0}", postId);
            var postTitle = match.Groups["inner"].Value.Trim();

            // The topic title is only looked up when there is a title to compare it with.
            if (postTitle == string.Empty
                || postTitle == string.Format("Re: {0}", this.ExtractTopicTitleFromHtml(content)))
            {
                postTitle = fallbackTitle;
            }

            // Remove Topic Icons if found
            var iconStart = postTitle.IndexOf("<img", StringComparison.Ordinal);

            if (iconStart >= 0)
            {
                var iconEnd = postTitle.IndexOf("/>", iconStart, StringComparison.Ordinal);

                // Only cut when the tag is actually terminated; a blind offset would
                // otherwise chop arbitrary characters off the title.
                if (iconEnd >= 0)
                {
                    postTitle = postTitle.Substring(iconEnd + 2).Trim();
                }

                // A header that held nothing but the icon has no usable title.
                if (postTitle.Length == 0)
                {
                    postTitle = fallbackTitle;
                }
            }

            return Utility.ReplaceHexWithAscii(postTitle);
        }
Пример #2
0
        /// <summary>
        /// Collects the links and images found inside the message block of a single post.
        /// </summary>
        /// <param name="htmlDump">
        /// The HTML of the page that contains the post.
        /// </param>
        /// <param name="postId">
        /// The post id, or a post url whose id follows a "#post" anchor.
        /// </param>
        /// <returns>
        /// One <c>ImageInfo</c> per link that passes the <c>Utility.IsImageNoneSense</c> filter,
        /// followed by one per image (unfiltered). The list is empty when
        /// <paramref name="htmlDump"/> is null/empty or the post's message block is not found.
        /// </returns>
        public static List<ImageInfo> ExtractImagesLinksHtml(string htmlDump, string postId)
        {
            if (string.IsNullOrEmpty(htmlDump))
            {
                return new List<ImageInfo>();
            }

            // Reduce a post url to the bare id that follows its "#post" anchor.
            if (!string.IsNullOrEmpty(postId))
            {
                var anchor = postId.IndexOf("#post", System.StringComparison.Ordinal);

                if (anchor >= 0)
                {
                    postId = postId.Substring(anchor + 5);
                }
            }

            htmlDump = htmlDump.Replace("&amp;", "&");

            // use only message content
            var messageStart = string.Format("<div id=\"post_message_{0}\">", postId);
            const string MessageEnd = "</blockquote>";

            var startIndex = htmlDump.IndexOf(messageStart, System.StringComparison.Ordinal);

            if (startIndex < 0)
            {
                // The post is not on this page; there is nothing to scan.
                return new List<ImageInfo>();
            }

            startIndex += messageStart.Length;

            var endIndex = htmlDump.IndexOf(MessageEnd, startIndex, System.StringComparison.Ordinal);

            // Without a closing marker scan everything after the start of the message
            // instead of failing on a negative length.
            htmlDump = endIndex < 0
                           ? htmlDump.Substring(startIndex)
                           : htmlDump.Substring(startIndex, endIndex - startIndex);

            // Parse all Links <a>
            var rtnList =
                LinkFinder.ListAllLinks(htmlDump)
                    .Select(
                        link =>
                        new ImageInfo
                        {
                            ImageUrl = RemoveRedirectLink(Utility.ReplaceHexWithAscii(link.Href)),
                            ThumbnailUrl = Utility.ReplaceHexWithAscii(link.Text)
                        })
                    .Where(
                        item =>
                        !Utility.IsImageNoneSense(item.ImageUrl)
                        && !Utility.IsImageNoneSense(item.ThumbnailUrl))
                    .ToList();

            // Parse all Images
            rtnList.AddRange(
                LinkFinder.ListAllImages(htmlDump)
                    .Select(
                        link =>
                        new ImageInfo
                        {
                            ImageUrl = RemoveRedirectLink(Utility.ReplaceHexWithAscii(link.Href)),
                            ThumbnailUrl = Utility.ReplaceHexWithAscii(link.Text)
                        }));

            return rtnList;
        }
Пример #3
0
        /// <summary>
        /// Reads the "Image" table of an XML dump and returns its rows as <c>ImageInfo</c>
        /// entries, keeping only the first entry per image url.
        /// </summary>
        /// <param name="xmlDump">
        /// The XML dump.
        /// </param>
        /// <returns>
        /// The distinct entries in document order. The list is empty when the dump has no
        /// "Image" table. When reading fails, the error is shown in a message box and the
        /// entries collected so far are returned.
        /// </returns>
        public static List<ImageInfo> ExtractRiPUrls(string xmlDump)
        {
            var rtnList = new List<ImageInfo>();
            var seenUrls = new HashSet<string>();

            try
            {
                using (var reader = new StringReader(xmlDump))
                using (var ds = new DataSet())
                {
                    ds.ReadXml(reader);

                    var imageTable = ds.Tables["Image"];

                    if (imageTable == null)
                    {
                        // A dump without images is not an error.
                        return rtnList;
                    }

                    // The row indexer throws for an unknown column instead of returning
                    // null, so the optional column has to be probed on the table.
                    var hasThumbColumn = imageTable.Columns.Contains("thumb_url");

                    foreach (DataRow row in imageTable.Rows)
                    {
                        var imageInfo = new ImageInfo
                        {
                            ImageUrl = Utility.ReplaceHexWithAscii(row["main_url"].ToString()),
                            ThumbnailUrl = hasThumbColumn ? row["thumb_url"].ToString() : string.Empty
                        };

                        // Add returns false for an url that was already collected.
                        if (!seenUrls.Add(imageInfo.ImageUrl))
                        {
                            continue;
                        }

                        rtnList.Add(imageInfo);
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show($"{ex.Message}\n{ex.StackTrace}");
            }

            return rtnList;
        }
Пример #4
0
        /// <summary>
        /// Reads the "post" table of an XML dump and returns one <c>ImageInfo</c> per distinct
        /// post id, with the id stored in <c>ImageUrl</c>.
        /// </summary>
        /// <param name="xmlDump">
        /// The XML dump.
        /// </param>
        /// <returns>
        /// The distinct post ids in document order. When reading fails, the dump and the stack
        /// trace are handed to <c>Utility.SaveOnCrash</c> and the entries collected so far are
        /// returned.
        /// </returns>
        public static List<ImageInfo> ExtractThreadtoPosts(string xmlDump)
        {
            var rtnList = new List<ImageInfo>();
            var seenIds = new HashSet<string>();

            try
            {
                using (var reader = new StringReader(xmlDump))
                using (var ds = new DataSet())
                {
                    ds.ReadXml(reader);

                    // A dump without a "post" table (or without an "id" column) throws here
                    // and is handled by the catch below, which stores the dump.
                    foreach (DataRow row in ds.Tables["post"].Rows)
                    {
                        // The row indexer never returns null; missing values are DBNull,
                        // so IsNull is the check that actually skips posts without an id.
                        if (row.IsNull("id"))
                        {
                            continue;
                        }

                        var postInfo = new ImageInfo
                        {
                            ImageUrl = Utility.ReplaceHexWithAscii(row["id"].ToString())
                        };

                        // Add returns false for an id that was already collected.
                        if (!seenIds.Add(postInfo.ImageUrl))
                        {
                            continue;
                        }

                        rtnList.Add(postInfo);
                    }
                }
            }
            catch (Exception ex)
            {
                Utility.SaveOnCrash(xmlDump, ex.StackTrace, null);
            }

            return rtnList;
        }
Пример #5
0
        /// <summary>
        /// Fetches the image host page for <c>ImageLinkURL</c>, locates the "thepic" image tag
        /// on it and downloads the referenced file into <c>SavePath</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the url is already registered in <c>EventTable</c>, when the download
        /// completed, or when it was interrupted by a <see cref="ThreadAbortException"/> or an
        /// <see cref="IOException"/>; <c>false</c> when the save directory cannot be created, the
        /// host page is too short or lacks the image tag, or the download raised a
        /// <see cref="WebException"/>.
        /// </returns>
        protected override bool DoDownload()
        {
            const string ImageTagStart = "<img id=\"thepic\" name=\"thisimage\" src=\"";
            const string ImageTagEnd = "\" width";

            var strImgURL = this.ImageLinkURL;

            if (this.EventTable.ContainsKey(strImgURL))
            {
                return true;
            }

            try
            {
                if (!Directory.Exists(this.SavePath))
                {
                    Directory.CreateDirectory(this.SavePath);
                }
            }
            catch (IOException)
            {
                return false;
            }

            // The file name is the last url segment without its ".html" suffix.
            var strFilePath =
                strImgURL.Substring(strImgURL.LastIndexOf("/", StringComparison.Ordinal) + 1)
                    .Replace(".html", string.Empty);

            strFilePath = Path.Combine(this.SavePath, Utility.RemoveIllegalCharecters(strFilePath));

            var cacheEntry = new CacheObject
            {
                IsDownloaded = false,
                FilePath = strFilePath,
                Url = strImgURL
            };

            try
            {
                this.EventTable.Add(strImgURL, cacheEntry);
            }
            catch (ThreadAbortException)
            {
                return true;
            }
            catch (Exception)
            {
                if (this.EventTable.ContainsKey(strImgURL))
                {
                    return false;
                }

                this.EventTable.Add(strImgURL, cacheEntry);
            }

            // NOTE(review): strImgURL is passed by ref and may come back changed, so every
            // EventTable lookup below uses ImageLinkURL, the key the entry was stored under.
            var strIVPage = this.GetImageHostPage(ref strImgURL);

            if (strIVPage.Length < 10)
            {
                return false;
            }

            var iStartIMG = strIVPage.IndexOf(ImageTagStart, StringComparison.Ordinal);

            if (iStartIMG < 0)
            {
                return false;
            }

            iStartIMG += ImageTagStart.Length;

            var iEndSRC = strIVPage.IndexOf(ImageTagEnd, iStartIMG, StringComparison.Ordinal);

            if (iEndSRC < 0)
            {
                return false;
            }

            var strNewURL = Utility.ReplaceHexWithAscii(strIVPage.Substring(iStartIMG, iEndSRC - iStartIMG));

            var newAlteredPath = Utility.GetSuitableName(strFilePath);

            if (strFilePath != newAlteredPath)
            {
                strFilePath = newAlteredPath;
                ((CacheObject)this.EventTable[this.ImageLinkURL]).FilePath = strFilePath;
            }

            try
            {
                // The using block releases the client even when the download throws.
                using (var client = new WebClient())
                {
                    client.Headers.Add("Referer: " + strImgURL);
                    client.Headers.Add(
                        "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6");
                    client.DownloadFile(strNewURL, strFilePath);
                }
            }
            catch (ThreadAbortException)
            {
                ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
                ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

                return true;
            }
            catch (IOException)
            {
                ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
                ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

                return true;
            }
            catch (WebException)
            {
                ((CacheObject)this.EventTable[this.ImageLinkURL]).IsDownloaded = false;
                ThreadManager.GetInstance().RemoveThreadbyId(this.ImageLinkURL);

                return false;
            }

            var finishedEntry = (CacheObject)this.EventTable[this.ImageLinkURL];

            finishedEntry.IsDownloaded = true;
            finishedEntry.FilePath = strFilePath;

            CacheController.Instance().LastPic = strFilePath;

            return true;
        }
Пример #6
0
        /// <summary>
        /// Collects the links found in the attachments section of a page or, when the page has
        /// no such section, in the message block of the given post.
        /// </summary>
        /// <param name="htmlDump">The HTML of the page.</param>
        /// <param name="postId">
        /// The post id, or a post url whose id follows a "#post" anchor. Only used when the page
        /// has no attachments section.
        /// </param>
        /// <returns>
        /// One <c>ImageInfo</c> (with an empty thumbnail url) per link that passes the
        /// <c>Utility.IsImageNoneSense</c> filter. Links that are not absolute http/https urls
        /// get the current forum url prepended. The list is empty when
        /// <paramref name="htmlDump"/> is null/empty or neither section is found.
        /// </returns>
        public static List<ImageInfo> ExtractAttachmentImagesHtml(string htmlDump, string postId)
        {
            var rtnList = new List<ImageInfo>();

            if (string.IsNullOrEmpty(htmlDump))
            {
                return rtnList;
            }

            htmlDump = htmlDump.Replace("&amp;", "&");

            var start = "<div class=\"attachments\">";
            var end = "<!-- / attachments -->";

            var iStart = htmlDump.IndexOf(start, System.StringComparison.Ordinal);

            if (iStart < 0)
            {
                // No attachments section: fall back to the message block of the post.
                if (postId != null)
                {
                    var anchor = postId.IndexOf("#post", System.StringComparison.Ordinal);

                    if (anchor >= 0)
                    {
                        postId = postId.Substring(anchor + 5);
                    }
                }

                start = string.Format("<div id=\"post_message_{0}\">", postId);
                end = "</blockquote>";

                iStart = htmlDump.IndexOf(start, System.StringComparison.Ordinal);

                if (iStart < 0)
                {
                    // Return Empty List
                    return rtnList;
                }
            }

            iStart += start.Length;

            // Look for the end marker only after the start marker, so an earlier occurrence
            // cannot produce a negative length.
            var iEnd = htmlDump.IndexOf(end, iStart, System.StringComparison.Ordinal);

            // When the end marker is missing the whole page is scanned, as before.
            if (iEnd >= 0)
            {
                htmlDump = htmlDump.Substring(iStart, iEnd - iStart);
            }

            rtnList.AddRange(
                LinkFinder.ListAllLinks(htmlDump)
                    .Select(
                        link =>
                        new ImageInfo
                        {
                            ImageUrl =
                                link.Href.StartsWith("http://", System.StringComparison.Ordinal)
                                || link.Href.StartsWith("https://", System.StringComparison.Ordinal)
                                    ? link.Href
                                    : CacheController.Instance().UserSettings.CurrentForumUrl
                                      + Utility.ReplaceHexWithAscii(link.Href),
                            ThumbnailUrl = string.Empty
                        })
                    .Where(item => !Utility.IsImageNoneSense(item.ImageUrl)));

            return rtnList;
        }
Пример #7
0
        /// <summary>
        /// Extracts the title of the post referenced by <paramref name="url"/> from the page HTML,
        /// falling back to "post# {id}" when no title can be found.
        /// </summary>
        /// <param name="content">
        /// The HTML of the page that contains the post.
        /// </param>
        /// <param name="url">
        /// The post url. Everything after the first "#post" is used as the post id;
        /// when "#post" is absent the whole url is used.
        /// </param>
        /// <returns>
        /// The text of the first title header inside the post container, with line breaks and
        /// tabs removed and a leading icon tag stripped, or "post# {id}" when the container,
        /// the header or its text is missing. The result is passed through
        /// <c>Utility.ReplaceHexWithAscii</c>.
        /// </returns>
        public string ExtractPostTitleFromHtml(string content, string url)
        {
            const string MessageEnd = "</blockquote>";
            const string TitleStart = "<h2 class=\"title icon\">";
            const string TitleEnd = "</h2>";

            var anchor = url.IndexOf("#post", StringComparison.Ordinal);
            var sPostId = anchor >= 0 ? url.Substring(anchor + 5) : url;

            var fallbackTitle = string.Format("post# {0}", sPostId);

            if (string.IsNullOrEmpty(content))
            {
                return Utility.ReplaceHexWithAscii(fallbackTitle);
            }

            // Narrow the search to the container of the requested post.
            var sMessageStart = string.Format(
                "<li class=\"postbitlegacy postbitim postcontainer\" id=\"post_{0}\">",
                sPostId);

            var iStart = content.IndexOf(sMessageStart, StringComparison.Ordinal);

            if (iStart < 0)
            {
                return Utility.ReplaceHexWithAscii(fallbackTitle);
            }

            iStart += sMessageStart.Length;

            var iEnd = content.IndexOf(MessageEnd, iStart, StringComparison.Ordinal);

            if (iEnd < 0)
            {
                // Without the end of the post a header found later could belong to another post.
                return Utility.ReplaceHexWithAscii(fallbackTitle);
            }

            var pageContent = content.Substring(iStart, iEnd - iStart);

            var iTitleStart = pageContent.IndexOf(TitleStart, StringComparison.Ordinal);

            if (iTitleStart < 0)
            {
                return Utility.ReplaceHexWithAscii(fallbackTitle);
            }

            iTitleStart += TitleStart.Length;

            var iTitleEnd = pageContent.IndexOf(TitleEnd, iTitleStart, StringComparison.Ordinal);

            if (iTitleEnd < 0)
            {
                return Utility.ReplaceHexWithAscii(fallbackTitle);
            }

            var postTitle =
                pageContent.Substring(iTitleStart, iTitleEnd - iTitleStart)
                    .Replace("\r", string.Empty)
                    .Replace("\t", string.Empty)
                    .Replace("\n", string.Empty);

            // Remove Post Icon
            if (postTitle.StartsWith("<img src=", StringComparison.Ordinal))
            {
                var iconEnd = postTitle.IndexOf("/>", StringComparison.Ordinal);

                // An unterminated icon tag leaves no usable title text.
                postTitle = iconEnd >= 0 ? postTitle.Substring(iconEnd + 2).TrimStart() : string.Empty;
            }

            if (string.IsNullOrEmpty(postTitle))
            {
                postTitle = fallbackTitle;
            }

            return Utility.ReplaceHexWithAscii(postTitle);
        }
Пример #8
0
        /// <summary>
        /// Reads the "Image" table of an XML dump and returns its rows as <c>ImageInfo</c>
        /// entries with cleaned-up image urls, keeping only the first entry per url.
        /// </summary>
        /// <param name="strDump">
        /// The XML dump.
        /// </param>
        /// <returns>
        /// The distinct entries in document order. Urls rejected by
        /// <c>Utility.IsImageNoneSense</c> are skipped. The list is empty when the dump has no
        /// "Image" table. When processing fails, the error is shown in a message box and the
        /// entries collected so far are returned.
        /// </returns>
        public static List<ImageInfo> ExtractImages(string strDump)
        {
            var rtnList = new List<ImageInfo>();
            var seenUrls = new HashSet<string>();

            try
            {
                using (var reader = new StringReader(strDump))
                using (var ds = new DataSet())
                {
                    ds.ReadXml(reader);

                    var imageTable = ds.Tables["Image"];

                    if (imageTable == null)
                    {
                        // A dump without images is not an error.
                        return rtnList;
                    }

                    // Probe the optional column once instead of catching the exception the
                    // row indexer throws for an unknown column on every row.
                    var hasThumbColumn = imageTable.Columns.Contains("thumb_url");

                    foreach (DataRow row in imageTable.Rows)
                    {
                        var imageUrl = row["main_url"].ToString().Replace("\"", string.Empty);

                        if (Utility.IsImageNoneSense(imageUrl))
                        {
                            continue;
                        }

                        imageUrl = Utility.ReplaceHexWithAscii(imageUrl);

                        // Remove anonym.to from Link if exists
                        if (imageUrl.Contains("anonym.to"))
                        {
                            imageUrl = imageUrl.Replace("http://www.anonym.to/?", string.Empty);
                        }

                        // Remove redirect
                        if (imageUrl.Contains("redirect-to"))
                        {
                            imageUrl =
                                imageUrl.Replace(
                                    $"{CacheController.Instance().UserSettings.ForumURL}redirect-to/?redirect=",
                                    string.Empty);
                        }

                        // Get Real Url
                        if (imageUrl.Contains("/out/out.php?x="))
                        {
                            var req = (HttpWebRequest)WebRequest.Create(imageUrl);

                            req.Referer = imageUrl;
                            req.Timeout = 20000;

                            // The using block closes the response even when reading it fails.
                            using (var res = (HttpWebResponse)req.GetResponse())
                            {
                                imageUrl = res.ResponseUri.ToString();
                            }
                        }

                        // Add returns false for an url that was already collected.
                        if (!seenUrls.Add(imageUrl))
                        {
                            continue;
                        }

                        rtnList.Add(
                            new ImageInfo
                            {
                                ImageUrl = imageUrl,
                                ThumbnailUrl = hasThumbColumn ? row["thumb_url"].ToString() : string.Empty
                            });
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show($"{ex.Message}\n{ex.StackTrace}");
            }

            return rtnList;
        }