Exemple #1
0
        /// <summary>
        /// Worker loop for a download thread. Takes image URLs from <c>downloadList</c>, looks up the
        /// matching <c>DownloadInfo</c> in <c>imageList</c>, downloads the image and saves it to the path
        /// returned by <c>GetImageSavePath</c> when it is larger than 400x400 pixels.
        /// </summary>
        /// <remarks>
        /// The loop ends when <c>AppRunning</c> becomes false, or after more than five consecutive
        /// idle polls (500 ms apart) in which both <c>workList</c> and <c>downloadList</c> are empty.
        /// Download and save failures are logged through <c>WriteLog</c>; a failed save puts the URL
        /// back on <c>downloadList</c> so that it is tried again.
        /// </remarks>
        public void DownloadImageLoop()
        {
            WriteLog("Download Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);

            // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
            Directory.CreateDirectory("D:\\WebDownload");

            string       downurl  = "";
            DownloadInfo downinfo = null;
            int          trytag   = 0;   // number of consecutive idle polls

            while (true)
            {
                if (!AppRunning)
                {
                    break;
                }

                if (!bIsStartWork)
                {
                    // NOTE(review): Thread.Suspend is obsolete in .NET; it is kept because the code
                    // that resumes this thread is not part of this method.
                    Thread.CurrentThread.Suspend();
                }

                try
                {
                    if (workList.Count <= 0 && downloadList.Count <= 0)
                    {
                        // Nothing to parse and nothing to download: give up after repeated idle polls.
                        if (trytag > 5)
                        {
                            break;
                        }

                        trytag++;
                        Thread.Sleep(500);
                        continue;
                    }

                    // There is (or was a moment ago) pending work, so the idle streak is over.
                    // Without this reset, idle polls would add up over the whole run and the
                    // thread could exit while the crawl is still producing work.
                    trytag = 0;

                    bool dequeued = false;
                    lock (downloadList)
                    {
                        if (downloadList.Count > 0)
                        {
                            downurl  = downloadList.Dequeue();
                            downinfo = imageList[downurl];
                            dequeued = true;
                        }
                    }

                    if (!dequeued)
                    {
                        // Wait outside the lock so other threads can use the queue in the meantime.
                        Thread.Sleep(500);
                        continue;
                    }

                    var filepath = GetImageSavePath(downinfo.imageUrl);

                    // If the file already exists, treat the image as downloaded and skip it.
                    if (File.Exists(filepath))
                    {
                        continue;
                    }

                    try
                    {
                        Byte[] imgdata;
                        using (WebClient mywebclient = new WebClient())
                        {
                            imgdata = mywebclient.DownloadData(downinfo.imageUrl);
                        }

                        // The stream must stay open for as long as the Image is in use, so the
                        // image is disposed first (inner using) and the stream afterwards.
                        using (MemoryStream ms = new MemoryStream(imgdata))
                        using (Image img = Image.FromStream(ms))
                        {
                            downinfo.imgSize = img.Size;

                            // Only images larger than 400x400 are saved.
                            if (img.Width > 400 && img.Height > 400)
                            {
                                Directory.CreateDirectory(GetImageSaveFolder(downinfo.imageUrl));

                                try
                                {
                                    img.Save(filepath);
                                }
                                catch (Exception exp)
                                {
                                    WriteLog("Save Image error:" + exp.Message + "\r\nThe url is :" + downinfo.workUrl + "\r\nThe save path is:"
                                             + filepath);

                                    // Put the URL back so the save is retried later.
                                    // NOTE(review): there is no retry limit here - confirm a permanently
                                    // failing save cannot loop forever.
                                    lock (downloadList)
                                    {
                                        downloadList.Enqueue(downurl);
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        WriteLog("Download image failed:" + ex.Message + "\r\nThe URL is :" + downurl);
                    }
                }
                catch (Exception e)
                {
                    WriteLog("Download image failed:" + e.Message);
                }
            }

            WriteLog("Download Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
        }
Exemple #2
0
        /// <summary>
        /// Worker loop for a parser thread. Takes page URLs from <c>workList</c>, fetches each page
        /// with <c>GetHtml</c> and scans it tag by tag for <c>&lt;img&gt;</c>, <c>&lt;a&gt;</c> and
        /// <c>&lt;title&gt;</c> elements.
        /// </summary>
        /// <remarks>
        /// New image URLs are registered in <c>imageList</c>, queued on <c>downloadList</c> and added
        /// to <c>addToViewList</c>. New page links that pass the site / keyword filters are recorded
        /// in <c>linkList</c> and queued on <c>workList</c>. The loop runs until <c>AppRunning</c>
        /// becomes false; an exception while processing a page is logged and the loop moves on to
        /// the next page.
        /// </remarks>
        public void ParserLoop()
        {
            WriteLog("Parser Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);
            while (true)
            {
                if (!AppRunning)
                {
                    break;
                }

                if (!bIsStartWork)
                {
                    // NOTE(review): Thread.Suspend is obsolete in .NET; it is kept because the code
                    // that resumes this thread is not part of this method.
                    Thread.CurrentThread.Suspend();
                }

                string workurl = null;
                bool hasWork = false;
                lock (workList)
                {
                    if (workList.Count > 0)
                    {
                        workurl = workList.Dequeue();
                        hasWork = true;
                    }
                }

                if (!hasWork)
                {
                    // Wait outside the lock so producers can enqueue while this thread sleeps.
                    Thread.Sleep(500);
                    continue;
                }

                if (string.IsNullOrEmpty(workurl))
                {
                    // Nothing usable was queued; skip it instead of failing below.
                    continue;
                }

                // Site root of the current URL, used to resolve site-relative paths. The search
                // starts at index 8 to step over the "http://" / "https://" prefix; URLs too short
                // for that offset are used unchanged.
                int pathStart = workurl.Length > 8 ? workurl.IndexOf('/', 8) : -1;
                string orgbaseurl = pathStart == -1 ? workurl : workurl.Substring(0, pathStart);

                try
                {
                    string pageHtml = GetHtml(workurl, null);

                    // Put every tag on its own line so that each line holds at most one tag.
                    pageHtml = pageHtml.Replace("<", "\n<");

                    List<string> list = new List<string>();
                    foreach (string rawLine in pageHtml.Split('\n'))
                    {
                        string trimmed = rawLine.Replace("\t", "").Trim();
                        if (trimmed.Length > 5)
                        {
                            list.Add(trimmed);
                        }
                    }

                    // Configured start URL without its extension; computed once per page because
                    // it does not depend on the individual tag.
                    string configuredUrl = config.workUrl;
                    int lastDot = configuredUrl.LastIndexOf('.');
                    string workpage = lastDot == -1 ? configuredUrl : configuredUrl.Substring(0, lastDot);

                    string title = "no title";

                    foreach (string item in list)
                    {
                        string splitstring;
                        bool bParserIMG;

                        if (item.IndexOf("<img", StringComparison.OrdinalIgnoreCase) != -1)
                        {
                            splitstring = "src=";
                            bParserIMG = true;
                        }
                        else if (item.IndexOf("<a", StringComparison.OrdinalIgnoreCase) != -1)
                        {
                            splitstring = "href=";
                            bParserIMG = false;
                        }
                        else
                        {
                            int titleTag = item.IndexOf("<title", StringComparison.OrdinalIgnoreCase);
                            if (titleTag != -1)
                            {
                                // Look for the end of the opening tag rather than assuming a bare
                                // "<title>", so a tag with attributes is handled too.
                                int start = item.IndexOf('>', titleTag);
                                if (start != -1)
                                {
                                    start++;
                                    int end = item.IndexOf("</title>", start, StringComparison.OrdinalIgnoreCase);
                                    if (end == -1)
                                    {
                                        end = item.Length;
                                    }
                                    title = item.Substring(start, end - start);
                                }
                            }
                            continue;
                        }

                        // Parse the URL out of the tag.
                        if (item.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase) == -1)
                        {
                            continue;
                        }

                        string url = "";
                        foreach (string attribute in item.Split(' '))
                        {
                            if (attribute.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase) != -1)
                            {
                                url = attribute.Substring(attribute.IndexOf('=') + 1);
                                break;
                            }
                        }

                        url = GetRealURL(url, orgbaseurl, workurl);
                        if (string.IsNullOrEmpty(url))
                        {
                            continue;
                        }

                        // Page links must belong to the current site.
                        if (!bParserIMG && url.IndexOf(baseurl, StringComparison.OrdinalIgnoreCase) == -1)
                        {
                            continue;
                        }

                        if (!bParserIMG && config.OnlyCurPage && url.IndexOf(workpage, StringComparison.Ordinal) == -1)
                        {
                            continue;
                        }

                        // Some <a> tags point straight at an image; treat those as images.
                        if (!bParserIMG)
                        {
                            bool looksLikeImage =
                                url.EndsWith("jpeg", StringComparison.OrdinalIgnoreCase)
                                || url.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase)
                                || url.EndsWith(".png", StringComparison.OrdinalIgnoreCase)
                                || url.EndsWith(".bmp", StringComparison.OrdinalIgnoreCase)
                                || url.EndsWith(".gif", StringComparison.OrdinalIgnoreCase);

                            if (looksLikeImage)
                            {
                                // Keep only the last absolute URL contained in the link. The scheme
                                // may be "http" (4 chars) or "https" (5 chars) before "://".
                                int schemeSep = url.LastIndexOf("://", StringComparison.Ordinal);
                                if (schemeSep >= 4)
                                {
                                    int schemeStart = schemeSep - 4;
                                    if (schemeSep >= 5
                                        && url.Substring(schemeSep - 5, 5).Equals("https", StringComparison.OrdinalIgnoreCase))
                                    {
                                        schemeStart = schemeSep - 5;
                                    }
                                    url = url.Substring(schemeStart);
                                }
                                bParserIMG = true;
                            }
                        }

                        if (bParserIMG)
                        {
                            if (imageList.ContainsKey(url))
                            {
                                continue;
                            }

                            // Derive a simplified title by removing the first matching page-split pattern.
                            string samiltitle = title;
                            foreach (var split in config.pageSplit)
                            {
                                Regex rg = new Regex(split);
                                if (rg.Match(title).Value != "")
                                {
                                    samiltitle = rg.Replace(title, "");
                                    break;
                                }
                            }
                            // Remove the separator commonly used for multi-page titles.
                            samiltitle = samiltitle.Replace("-", "").Trim();

                            // Prepare the download record.
                            DownloadInfo info = new DownloadInfo();
                            info.imageUrl = url;
                            info.baseUrl = orgbaseurl;
                            info.title = title;
                            info.samilTitle = samiltitle;
                            info.workUrl = workurl;

                            // Queue<T> is not safe for concurrent use, so the queue and the lookup
                            // table are updated together under the queue's lock.
                            bool queued = false;
                            lock (downloadList)
                            {
                                if (!imageList.ContainsKey(url))
                                {
                                    imageList.Add(url, info);
                                    downloadList.Enqueue(url);
                                    queued = true;
                                }
                            }

                            if (queued)
                            {
                                lock (addToViewList)
                                {
                                    addToViewList.Add(url);
                                }
                            }
                        }
                        else
                        {
                            if (config.UrlKeyWords != null && url.IndexOf(config.UrlKeyWords, StringComparison.Ordinal) == -1)
                            {
                                continue;
                            }

                            if (!linkList.ContainsKey(url))
                            {
                                linkList.Add(url, null);
                                lock (workList)
                                {
                                    workList.Enqueue(url);
                                }
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    WriteLog("Parser fialed:" + e.ToString());
                }
            }
            WriteLog("Parser Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
        }
Exemple #3
0
        /// <summary>
        /// Parser thread body: repeatedly dequeues a page URL from <c>workList</c>, downloads the page
        /// via <c>GetHtml</c>, and walks its tags. Image URLs end up in <c>imageList</c>,
        /// <c>downloadList</c> and <c>addToViewList</c>; page links that pass the filters end up in
        /// <c>linkList</c> and back on <c>workList</c>.
        /// </summary>
        /// <remarks>
        /// Runs until <c>AppRunning</c> is false. Any exception raised while handling one page is
        /// written to the log and the next page is processed.
        /// </remarks>
        public void ParserLoop()
        {
            WriteLog("Parser Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);
            while (AppRunning)
            {
                if (!bIsStartWork)
                {
                    // NOTE(review): Thread.Suspend is obsolete in .NET; it is kept because the code
                    // that resumes this thread is not part of this method.
                    Thread.CurrentThread.Suspend();
                }

                string workurl;
                lock (workList)
                {
                    workurl = workList.Count > 0 ? workList.Dequeue() : null;
                }

                if (string.IsNullOrEmpty(workurl))
                {
                    // Queue empty (or an unusable entry): back off without holding the lock.
                    Thread.Sleep(500);
                    continue;
                }

                // Root of the current site, used to resolve site-relative paths.
                string orgbaseurl = GetSiteRoot(workurl);

                try
                {
                    // One tag per line: break before every '<', then drop tabs, padding and
                    // fragments of five characters or fewer.
                    string pageHtml = GetHtml(workurl, null).Replace("<", "\n<");
                    List<string> tagLines = new List<string>();
                    foreach (string piece in pageHtml.Split('\n'))
                    {
                        string cleaned = piece.Replace("\t", "").Trim();
                        if (cleaned.Length > 5)
                        {
                            tagLines.Add(cleaned);
                        }
                    }

                    // Configured start URL minus its extension; the same for every tag on the page.
                    string startUrl = config.workUrl;
                    int extensionDot = startUrl.LastIndexOf('.');
                    string workpage = extensionDot < 0 ? startUrl : startUrl.Substring(0, extensionDot);

                    string title = "no title";

                    foreach (string tagLine in tagLines)
                    {
                        bool isImage = tagLine.IndexOf("<img", StringComparison.OrdinalIgnoreCase) >= 0;
                        bool isAnchor = !isImage && tagLine.IndexOf("<a", StringComparison.OrdinalIgnoreCase) >= 0;

                        if (!isImage && !isAnchor)
                        {
                            if (tagLine.IndexOf("<title", StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                title = ReadTitleText(tagLine, title);
                            }
                            continue;
                        }

                        // Attribute that carries the URL for this kind of tag.
                        string urlAttribute = isImage ? "src=" : "href=";
                        if (tagLine.IndexOf(urlAttribute, StringComparison.OrdinalIgnoreCase) < 0)
                        {
                            continue;
                        }

                        string url = "";
                        foreach (string token in tagLine.Split(' '))
                        {
                            if (token.IndexOf(urlAttribute, StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                url = token.Substring(token.IndexOf('=') + 1);
                                break;
                            }
                        }

                        url = GetRealURL(url, orgbaseurl, workurl);
                        if (string.IsNullOrEmpty(url))
                        {
                            continue;
                        }

                        if (!isImage)
                        {
                            // Links must stay on the current site...
                            if (url.IndexOf(baseurl, StringComparison.OrdinalIgnoreCase) < 0)
                            {
                                continue;
                            }

                            // ...and, when configured, under the start page.
                            if (config.OnlyCurPage && url.IndexOf(workpage, StringComparison.Ordinal) < 0)
                            {
                                continue;
                            }

                            // Some <a> tags link directly to an image; handle them as images.
                            if (HasImageSuffix(url))
                            {
                                url = CutToLastAbsoluteUrl(url);
                                isImage = true;
                            }
                        }

                        if (isImage)
                        {
                            if (imageList.ContainsKey(url))
                            {
                                continue;
                            }

                            // Simplified title: strip the first page-split pattern that matches.
                            string samiltitle = title;
                            foreach (var split in config.pageSplit)
                            {
                                Regex splitter = new Regex(split);
                                if (splitter.Match(title).Value != "")
                                {
                                    samiltitle = splitter.Replace(title, "");
                                    break;
                                }
                            }
                            // Drop the separator usually used between pages of a multi-page title.
                            samiltitle = samiltitle.Replace("-", "").Trim();

                            DownloadInfo info = new DownloadInfo();
                            info.imageUrl   = url;
                            info.baseUrl    = orgbaseurl;
                            info.title      = title;
                            info.samilTitle = samiltitle;
                            info.workUrl    = workurl;

                            // Queue<T> must not be mutated concurrently, so the record and the queue
                            // entry are published together under the queue's lock.
                            bool added = false;
                            lock (downloadList)
                            {
                                if (!imageList.ContainsKey(url))
                                {
                                    imageList.Add(url, info);
                                    downloadList.Enqueue(url);
                                    added = true;
                                }
                            }

                            if (added)
                            {
                                lock (addToViewList)
                                {
                                    addToViewList.Add(url);
                                }
                            }
                        }
                        else
                        {
                            if (config.UrlKeyWords != null && url.IndexOf(config.UrlKeyWords, StringComparison.Ordinal) < 0)
                            {
                                continue;
                            }

                            if (!linkList.ContainsKey(url))
                            {
                                linkList.Add(url, null);
                                lock (workList)
                                {
                                    workList.Enqueue(url);
                                }
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    WriteLog("Parser fialed:" + e.ToString());
                }
            }
            WriteLog("Parser Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
        }

        /// <summary>
        /// Returns the part of <paramref name="pageUrl"/> before the first '/' found at or after
        /// index 8 (past the scheme prefix), or the whole URL when there is no such slash.
        /// </summary>
        private static string GetSiteRoot(string pageUrl)
        {
            // IndexOf(char, startIndex) throws when startIndex is beyond the string length.
            if (pageUrl.Length <= 8)
            {
                return pageUrl;
            }

            int slash = pageUrl.IndexOf('/', 8);
            return slash < 0 ? pageUrl : pageUrl.Substring(0, slash);
        }

        /// <summary>
        /// Extracts the text that follows the opening title tag in <paramref name="tagLine"/>, up to
        /// a closing title tag or the end of the line. Returns <paramref name="fallback"/> when the
        /// opening tag is not terminated by '>'.
        /// </summary>
        private static string ReadTitleText(string tagLine, string fallback)
        {
            int tagStart = tagLine.IndexOf("<title", StringComparison.OrdinalIgnoreCase);
            if (tagStart < 0)
            {
                return fallback;
            }

            int textStart = tagLine.IndexOf('>', tagStart);
            if (textStart < 0)
            {
                return fallback;
            }
            textStart++;

            int textEnd = tagLine.IndexOf("</title>", textStart, StringComparison.OrdinalIgnoreCase);
            if (textEnd < 0)
            {
                textEnd = tagLine.Length;
            }
            return tagLine.Substring(textStart, textEnd - textStart);
        }

        /// <summary>
        /// Tells whether <paramref name="url"/> ends with one of the recognised image suffixes
        /// ("jpeg", ".jpg", ".png", ".bmp", ".gif"), ignoring case.
        /// </summary>
        private static bool HasImageSuffix(string url)
        {
            string[] suffixes = { "jpeg", ".jpg", ".png", ".bmp", ".gif" };
            foreach (string suffix in suffixes)
            {
                if (url.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Returns the tail of <paramref name="url"/> starting at the scheme ("http" or "https") of
        /// the last "://" it contains. The URL is returned unchanged when no such scheme fits.
        /// </summary>
        private static string CutToLastAbsoluteUrl(string url)
        {
            int separator = url.LastIndexOf("://", StringComparison.Ordinal);
            if (separator < 4)
            {
                return url;
            }

            int schemeStart = separator - 4;
            if (separator >= 5
                && url.Substring(separator - 5, 5).Equals("https", StringComparison.OrdinalIgnoreCase))
            {
                schemeStart = separator - 5;
            }
            return url.Substring(schemeStart);
        }