コード例 #1
0
        /// <summary>
        /// Downloads a content page, extracts the image URLs it contains, and raises
        /// <c>OnPageFetched</c> when at least one image URL was found.
        /// </summary>
        /// <param name="pageUrl">
        /// URL of the content page to download. It is also passed as the third argument of
        /// the <c>MyHttp.httpParameter</c> attached to the resulting <c>ImgInfo</c>.
        /// </param>
        /// <param name="picIndex">Index label used in the log output and stored in the <c>ImgInfo</c>.</param>
        private void fetchImgFromContentPage(string pageUrl, string picIndex)
        {
            string cookies;
            string myPageHtml = MyHttp.getHtml(pageUrl, out cookies);

            // Treat a null result like "no images" so the page is reported as invalid
            // below instead of failing on imgUrl.Length.
            string[] imgUrl = WebSiteConverter.imgReg(myPageHtml, ws.imgKeys) ?? new string[0];

            for (int i = 0; i < imgUrl.Length; i++)
            {
                if (imgUrl[i].StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative URL: prefix a scheme (always "http:", as before).
                    imgUrl[i] = "http:" + imgUrl[i];
                }
                else if (!imgUrl[i].StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    // Anything without an http/https prefix is treated as relative to the site
                    // domain. URI schemes are case-insensitive, so "HTTP://..." must not be
                    // prefixed with the domain.
                    imgUrl[i] = domain + imgUrl[i];
                }
            }

            // Page title converted into a valid file name.
            string myTitle = MyHttp.getValidFileName(MyHttp.getHtmlTitle(myPageHtml));

            if (imgUrl.Length == 0)
            {
                myWriteLine("Fetch Page Invalid:Index[" + picIndex + "]Title:" + myTitle, ConsoleColor.Red);
                return;
            }

            myWriteLine("Fetch Page Successful:Index[" + picIndex + "]Title:" + myTitle);

            // The counters are updated and the event is raised while holding addListLocker.
            lock (addListLocker)
            {
                ImgInfo tmpImgInfo = new ImgInfo(imgUrl, myTitle, picIndex, new MyHttp.httpParameter(cookies, "", pageUrl));
                fetchPageCount++;               // number of pages fetched
                fetchImgCount += imgUrl.Length; // number of images found
                OnPageFetched(null, new OnFetchedEventArgs(tmpImgInfo));
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds the URL for <paramref name="pageIndex"/>, downloads that page, and returns
        /// the content-page links that <c>WebSiteConverter.pageReg</c> extracts from its HTML.
        /// </summary>
        /// <remarks>
        /// The <paramref name="domain"/> and <paramref name="imgType"/> arguments are not read
        /// by this method; the URL is built from <c>ws.domain</c> and <c>ws.imgType</c> instead.
        /// </remarks>
        /// <param name="domain">Not used by the current implementation.</param>
        /// <param name="imgType">Not used by the current implementation.</param>
        /// <param name="pageIndex">Page index passed to <c>WebSiteConverter.getUrl</c>.</param>
        /// <returns>The array returned by <c>WebSiteConverter.pageReg</c> for the downloaded HTML.</returns>
        private string[] fetchContentPageUrl(string domain, string imgType, long pageIndex)
        {
            string listUrl = WebSiteConverter.getUrl(
                ws.urlPattern,
                ws.firstPageUrlPattern,
                ws.domain,
                ws.imgType,
                pageIndex);

            // `cookies` is not a local of this method; the out value is written to the
            // variable of that name declared outside it.
            string listHtml = MyHttp.getHtml(listUrl, out cookies);

            return WebSiteConverter.pageReg(listHtml, ws.pageRegex);
        }
コード例 #3
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, collects the href and text of every
        /// anchor tag, applies the filters configured in <c>Settings</c>, and writes each
        /// remaining link as an absolute URL through <c>myWriteLine</c>.
        /// </summary>
        /// <param name="uri">Page to download; also the base used to resolve relative hrefs.</param>
        private void parseLinks(Uri uri)
        {
            string html = MyHttp.getHtml(uri.AbsoluteUri);
            if (string.IsNullOrEmpty(html))
            {
                // Nothing to scan, and Regex.Match rejects a null input.
                return;
            }

            // href -> anchor text with nested tags stripped. When the same href occurs more
            // than once, the text of its last occurrence wins.
            var linkTextByHref = new Dictionary<string, string>();
            for (Match anchor = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
                 anchor.Success;
                 anchor = anchor.NextMatch())
            {
                string anchorHref = anchor.Groups[1].Value;
                string anchorText = Regex.Replace(anchor.Groups[2].Value, "(?i)<.*?>", string.Empty);
                linkTextByHref[anchorHref] = anchorText;
            }

            foreach (var item in linkTextByHref)
            {
                string href = item.Key;
                string text = item.Value;

                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Exclude hrefs ending with any configured suffix (case-insensitive).
                if (Settings.EscapeLinks != null && Settings.EscapeLinks.Count > 0 &&
                    Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
                {
                    continue;
                }

                // When href keywords are configured, the href must contain at least one of them.
                if (Settings.HrefKeywords != null && Settings.HrefKeywords.Count > 0 &&
                    !Settings.HrefKeywords.Any(href.Contains))
                {
                    continue;
                }

                // When text keywords are configured, the anchor text must contain at least one of them.
                if (Settings.TextKeywords != null && Settings.TextKeywords.Count > 0 &&
                    !Settings.TextKeywords.Any(text.Contains))
                {
                    continue;
                }

                // Undo a few encodings commonly found inside href attributes.
                // NOTE(review): only the lowercase escapes are handled; "%3F", "%3D" and "%2F"
                // pass through unchanged - confirm whether that is intended.
                string url = href.Replace("%3f", "?")
                                 .Replace("%3d", "=")
                                 .Replace("%2f", "/")
                                 .Replace("&amp;", "&");

                // Skip in-page fragments and non-navigable schemes.
                if (string.IsNullOrEmpty(url) ||
                    url.StartsWith("#", StringComparison.Ordinal) ||
                    url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Resolving against the page URI handles absolute and relative hrefs alike
                // (an absolute href ignores the base). TryCreate lets one malformed href be
                // skipped instead of throwing and aborting the remaining links, and a relative
                // path that merely begins with "http" is no longer parsed as absolute.
                Uri currentUri;
                if (!Uri.TryCreate(uri, url, out currentUri))
                {
                    continue;
                }

                myWriteLine(currentUri.AbsoluteUri);
            }
        }