Beispiel #1
0
        /// <summary>
        /// Starts downloading every image listed in <paramref name="myImgInfo"/> into a
        /// per-page directory and raises <c>OnPageFinished</c> once the last image of the
        /// page has been processed.
        /// </summary>
        /// <param name="myImgInfo">
        /// Page descriptor. Nothing is done when its <c>imgUrl</c> array is null or empty.
        /// Entries of <c>imgUrl</c> that do not contain "http" are rewritten in place with an
        /// "https:" prefix.
        /// </param>
        /// <remarks>
        /// The loop body is an async lambda, so <c>Parallel.For</c> only starts the downloads;
        /// this method may return before they have completed. Completion is reported through
        /// the counters and the <c>OnPageFinished</c> callback.
        /// </remarks>
        private void downloadPageImg(ImgInfo myImgInfo)
        {
            if (myImgInfo.imgUrl == null || myImgInfo.imgUrl.Length == 0)
            {
                return;
            }

            // Number of images of this page that have been processed so far.
            // Only touched while holding finishListLocker (see below).
            int downloadCount = 0;

            myWriteLine("Download Task Created:" + myImgInfo.picIndex, ConsoleColor.Yellow);

            // Target directory: <savePath><picIndex>-<title>\
            string myImgPath = savePath + myImgInfo.picIndex + "-" + myImgInfo.title + @"\";

            if (!Directory.Exists(myImgPath))
            {
                Directory.CreateDirectory(myImgPath);
            }

            Parallel.For(0, myImgInfo.imgUrl.Length, async(i, state) => {
                // URL correction: entries without "http" get an "https:" prefix.
                myImgInfo.imgUrl[i] = myImgInfo.imgUrl[i].IndexOf("http") != -1 ? myImgInfo.imgUrl[i] : "https:" + myImgInfo.imgUrl[i];

                // Fetch the image asynchronously; the returned string is the result text that gets logged.
                string saveRe = await MyHttp.getImg(myImgInfo.imgUrl[i], myImgPath, (i + 1) + "-" + myImgInfo.picIndex, myImgInfo.HP);

                // A result containing "Error:" is shown in red, anything else in green.
                myWriteLine("Img Downloaded Result:" + saveRe, saveRe.IndexOf("Error:") != -1 ? ConsoleColor.Red : ConsoleColor.Green);

                // The continuations of different images run concurrently, so the counters
                // must not be incremented without synchronization: a lost update on
                // downloadCount would mean the page-complete check below never succeeds.
                lock (finishListLocker) {
                    finishImgCount++;
                    if (++downloadCount == myImgInfo.imgUrl.Length)
                    {
                        // Last image of the page: report the whole page as finished.
                        finishPageCount++;
                        OnPageFinished(null, new OnFinishedEventArgs(myImgInfo));
                    }
                }
            }
                         );
        }
Beispiel #2
0
        /// <summary>
        /// Downloads the HTML of a content page, extracts its image URLs, and reports the
        /// page through <c>OnPageFetched</c> when at least one image was found.
        /// </summary>
        /// <param name="pageUrl">URL of the content page to fetch.</param>
        /// <param name="picIndex">Identifier of the page, used in log output and passed to <c>ImgInfo</c>.</param>
        private void fetchImgFromContentPage(string pageUrl, string picIndex)
        {
            string cookies  = string.Empty;
            string pageHtml = MyHttp.getHtml(pageUrl, out cookies);

            // Extract the image URLs using the image keys configured on ws.
            string[] links = WebSiteConverter.imgReg(pageHtml, ws.imgKeys);

            // Turn every extracted URL into an absolute one.
            for (int n = 0; n < links.Length; n++)
            {
                string link = links[n];

                if (link.StartsWith("//"))
                {
                    // Protocol-relative URL: prepend the scheme.
                    links[n] = "http:" + link;
                    continue;
                }

                if (!link.StartsWith("http"))
                {
                    // Anything else that does not start with "http" is prefixed with domain.
                    links[n] = domain + link;
                }
            }

            // Page title converted into a valid file name.
            string rawTitle  = MyHttp.getHtmlTitle(pageHtml);
            string safeTitle = MyHttp.getValidFileName(rawTitle);

            if (links.Length == 0)
            {
                myWriteLine("Fetch Page Invalid:Index[" + picIndex + "]Title:" + safeTitle, ConsoleColor.Red);
                return;
            }

            myWriteLine("Fetch Page Successful:Index[" + picIndex + "]Title:" + safeTitle);

            lock (addListLocker)
            {
                var requestParameter = new MyHttp.httpParameter(cookies, "", pageUrl);
                ImgInfo fetchedPage  = new ImgInfo(links, safeTitle, picIndex, requestParameter);

                fetchPageCount++;               // pages fetched so far
                fetchImgCount += links.Length;  // images discovered so far
                OnPageFetched(null, new OnFetchedEventArgs(fetchedPage));
            }
        }
Beispiel #3
0
 /// <summary>
 /// Walks over a list of content-page URLs and fetches each one via
 /// <c>fetchImgFromContentPage</c>, pausing while too many fetched pages are still
 /// unfinished and stopping early when <c>stopWorkFlag</c> is set.
 /// </summary>
 /// <param name="page">Content-page URLs; entries without "://" are prefixed with <c>domain</c>.</param>
 private void fetchImgUrlFromContentPage(string[] page)
 {
     foreach (string pageUrl in page)
     {
         // Throttle: always pause once, then keep pausing while the number of fetched but
         // unfinished pages is at or above MAX_WAIT_COUNT and no stop was requested.
         System.Threading.Thread.Sleep(1);
         while (fetchPageCount - finishPageCount >= MAX_WAIT_COUNT && !stopWorkFlag)
         {
             System.Threading.Thread.Sleep(1);
         }

         // Stop request: abandon the remaining pages.
         if (stopWorkFlag)
         {
             break;
         }

         // Build the full URL.
         string fullUrl;
         if (pageUrl.IndexOf(@"://") != -1)
         {
             fullUrl = pageUrl;
         }
         else
         {
             fullUrl = domain + pageUrl;
         }

         // Page index: the last path segment without its extension, or a hash of the URL
         // when that segment contains no dot.
         string picIndex = pageUrl.Substring(pageUrl.LastIndexOf("/") + 1);
         int    dotAt    = picIndex.LastIndexOf(".");
         if (dotAt == -1)
         {
             picIndex = MyHttp.get16bitMd5Str(pageUrl);
         }
         else
         {
             picIndex = picIndex.Substring(0, dotAt);
         }

         // Skip pages whose index already appears in allDirectiories (when enabled).
         bool alreadyPresent = JUMP_REPEATED_PAGE == true && allDirectiories.IndexOf(@"\" + picIndex + "-") != -1;
         if (alreadyPresent)
         {
             myWriteLine("Fetch Page Jumped:Index[" + picIndex + "]", ConsoleColor.Red);
             continue;
         }

         fetchImgFromContentPage(fullUrl, picIndex);
     }
 }
Beispiel #4
0
        /// <summary>
        /// Downloads the listing page with the given index and returns the content-page
        /// links found in it.
        /// </summary>
        /// <param name="domain">Not read by this method; the URL is built from <c>ws.domain</c>.</param>
        /// <param name="imgType">Not read by this method; the URL is built from <c>ws.imgType</c>.</param>
        /// <param name="pageIndex">Index of the listing page to download.</param>
        /// <returns>The links extracted from the listing HTML with <c>ws.pageRegex</c>.</returns>
        private string[] fetchContentPageUrl(string domain, string imgType, long pageIndex)
        {
            // Build the listing URL from the site description held in ws.
            string listingUrl = WebSiteConverter.getUrl(ws.urlPattern, ws.firstPageUrlPattern, ws.domain, ws.imgType, pageIndex);

            // Download the HTML; cookies is not a local here, so it is stored in a member declared elsewhere.
            string listingHtml = MyHttp.getHtml(listingUrl, out cookies);

            // Extract the content-page links.
            return WebSiteConverter.pageReg(listingHtml, ws.pageRegex);
        }
 /// <summary>
 /// Extracts page links from <paramref name="html"/> by passing it, together with
 /// <c>pagePattern</c>, to <c>MyHttp.regArr</c>.
 /// </summary>
 /// <param name="html">HTML text to search.</param>
 /// <returns>The array returned by <c>MyHttp.regArr</c>.</returns>
 public string[] pageReg(string html)
 {
     string[] pageLinks = MyHttp.regArr(html, pagePattern);
     return pageLinks;
 }
 /// <summary>
 /// Extracts image references from <paramref name="html"/> with <c>MyHttp.getHtmlImg</c>
 /// and passes them, together with <c>imgKeys</c>, to <c>MyHttp.getHtmlImgWithKey</c>.
 /// </summary>
 /// <param name="html">HTML text to search.</param>
 /// <returns>The array returned by <c>MyHttp.getHtmlImgWithKey</c>.</returns>
 public string[] imgReg(string html)
 {
     var allImages = MyHttp.getHtmlImg(html);
     return MyHttp.getHtmlImgWithKey(allImages, imgKeys);
 }
Beispiel #7
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, collects its anchor links, applies
        /// the filters configured in <c>Settings</c>, and writes the absolute URL of every
        /// accepted link via <c>myWriteLine</c>.
        /// </summary>
        /// <param name="uri">Address of the page to scan; also the base for resolving relative links.</param>
        /// <remarks>
        /// Links that cannot be parsed as a URI are skipped, so a single malformed
        /// <c>href</c> does not abort processing of the remaining links.
        /// </remarks>
        private void parseLinks(Uri uri)
        {
            string html          = MyHttp.getHtml(uri.AbsoluteUri);
            var    urlDictionary = new Dictionary <string, string>();
            Match  match         = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");

            while (match.Success)
            {
                // The href is the key; the anchor text with nested tags stripped is the value.
                // A later anchor with the same href overwrites the earlier text.
                string urlKey   = match.Groups[1].Value;
                string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);
                urlDictionary[urlKey] = urlValue;
                match = match.NextMatch();
            }

            foreach (var item in urlDictionary)
            {
                string href = item.Key;
                string text = item.Value;

                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Reject links ending with one of the excluded suffixes.
                if (Settings.EscapeLinks != null && Settings.EscapeLinks.Count > 0 &&
                    Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
                {
                    continue;
                }

                // When href keywords are configured, the href must contain at least one of them.
                if (Settings.HrefKeywords != null && Settings.HrefKeywords.Count > 0 &&
                    !Settings.HrefKeywords.Any(href.Contains))
                {
                    continue;
                }

                // When text keywords are configured, the anchor text must contain at least one of them.
                if (Settings.TextKeywords != null && Settings.TextKeywords.Count > 0 &&
                    !Settings.TextKeywords.Any(text.Contains))
                {
                    continue;
                }

                // Undo the escape sequences handled here (lower-case percent escapes and "&amp;").
                string url = href.Replace("%3f", "?")
                             .Replace("%3d", "=")
                             .Replace("%2f", "/")
                             .Replace("&amp;", "&");

                // Fragment-only, mail and script links are not reported.
                if (string.IsNullOrEmpty(url) || url.StartsWith("#") ||
                    url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Links starting with "http" are parsed as absolute URIs; everything else is
                // resolved against the page address. TryCreate is used instead of the Uri
                // constructor so that malformed links are skipped rather than throwing.
                Uri  currentUri;
                bool parsed;
                if (url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    parsed = Uri.TryCreate(url, UriKind.Absolute, out currentUri);
                }
                else
                {
                    parsed = Uri.TryCreate(uri, url, out currentUri);
                }

                if (!parsed)
                {
                    continue;
                }

                myWriteLine(currentUri.AbsoluteUri);
            }
        }