Example #1
0
        // Downloads the ifeng daily top-rank page with an explicit user agent and
        // the "utf-8" encoding, then checks that the page is non-empty and that the
        // expected heading text occurs at an index greater than zero.
        public void testPage()
        {
            const String url = "http://news.ifeng.com/toprank/day/";
            const String userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";

            String html = PageLoader.Download(url, userAgent, "utf-8");
            Assert.IsNotEmpty(html);

            int headingIndex = html.IndexOf("点击排行榜");
            Assert.Greater(headingIndex, 0);
        }
Example #2
0
        //#region 判断是否为土豆视频链接
        ///// <summary>
        ///// 判断是否为土豆视频链接
        ///// </summary>
        ///// <returns></returns>
        //public static bool IsTuDouVideoUrl(string url)
        //{
        //    if (url.StartsWith("http://www.tudou.com/programs/view/"))
        //    {
        //        return true;
        //    }
        //    else
        //    {
        //        return false;
        //    }
        //}
        //#endregion

        //#region 得到土豆视频的视频编码
        ///// <summary>
        ///// 得到土豆视频的视频编码
        ///// </summary>
        //public static string GetTuDouVideoItemCode(string url)
        //{
        //    String vid = "";
        //    //if (StringPlus.IsNullOrEmpty(url) || !IsTuDouVideoUrl(url))
        //    //{
        //    //    return "";
        //    //}
        //    if (StringPlus.IsNullOrEmpty(url))
        //    {
        //        return "";
        //    }
        //    vid = StringPlus.TrimStart(url, "http://www.tudou.com/programs/view/");
        //    if (StringPlus.IsNullOrEmpty(vid))
        //    {
        //        return "";
        //    }
        //    int FirstPoint = vid.IndexOf("/");
        //    if (FirstPoint > 0)
        //    {
        //        vid = vid.Substring(0, FirstPoint);
        //    }
        //    return vid;
        //}
        //#endregion

        //#region 得到土豆视频信息

        ///// <summary>
        ///// 得到土豆视频信息
        ///// </summary>
        ///// <param name="url"></param>
        ///// <returns></returns>
        //public static TuDouInfo GetTuDouInfo(string url)
        //{
        //    string itemCode = GetTuDouVideoItemCode(url);
        //    if (StringPlus.IsNullOrEmpty(itemCode))
        //    {
        //        return null;
        //    }

        //    string jsonData = PageLoader.Download(string.Format(tuDouJsonDataApiUrl, tuDouAppKey, itemCode));
        //    if (StringPlus.IsNullOrEmpty(jsonData))
        //    {
        //        return null;
        //    }

        //    string strStart = "{\"multiResult\":{\"results\":[";
        //    if (jsonData.StartsWith(strStart))
        //    {
        //        jsonData = jsonData.Replace(strStart, "");
        //    }

        //    string strEnd = "]}}";
        //    if (jsonData.EndsWith(strEnd))
        //    {
        //        int FirstPoint = jsonData.LastIndexOf(strEnd);
        //        if (FirstPoint > 0)
        //        {
        //            jsonData = jsonData.Substring(0, FirstPoint);
        //        }
        //    }

        //    TuDouInfo info = (TuDouInfo)Jayrock.Json.Conversion.JsonConvert.Import(typeof(TuDouInfo), jsonData);

        //    return info;
        //}
        //#endregion

        #region 判断是否是酷6视频链接
        /// <summary>
        /// Determines whether the given URL is recognised as a Ku6 video link.
        /// The URL is substituted into <c>ku6XmlDataApiUrl</c>, the resulting document
        /// is downloaded, and the <c>type</c> attribute of its <c>root/result</c>
        /// node is inspected.
        /// </summary>
        /// <param name="url">URL to check.</param>
        /// <returns>
        /// <c>true</c> when the response contains a <c>root/result</c> node whose
        /// <c>type</c> attribute parses as an integer other than -1; <c>false</c>
        /// for an empty response, malformed XML, a missing node or attribute, or a
        /// non-numeric attribute value.
        /// </returns>
        public static bool IsKu6VideoUrl(string url)
        {
            string xmlData = PageLoader.Download(string.Format(ku6XmlDataApiUrl, url));
            if (string.IsNullOrEmpty(xmlData))
            {
                return false;
            }

            XmlDocument xmldoc = new XmlDocument();
            try
            {
                xmldoc.LoadXml(xmlData);
            }
            catch (XmlException)
            {
                // A response that is not well-formed XML cannot confirm a video link;
                // a yes/no predicate should answer "no" rather than throw.
                return false;
            }

            XmlNode resultNode = xmldoc.SelectSingleNode("root/result");
            if (resultNode == null || resultNode.Attributes == null)
            {
                return false;
            }

            // Look the attribute up once and reuse it for both the null check and the value.
            XmlNode typeAttribute = resultNode.Attributes.GetNamedItem("type");
            if (typeAttribute == null)
            {
                return false;
            }

            // TryParse instead of Convert.ToInt32: a non-numeric value means "not a video",
            // not an exception.
            int type;
            if (!int.TryParse(typeAttribute.Value, out type))
            {
                return false;
            }

            // -1 is the value the original code treats as "not a Ku6 video".
            return type != -1;
        }
Example #3
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for a play page by downloading it and
        /// reading the "pic", "swfOutsideUrl" and "title" entries of the embedded
        /// <c>video : {...}</c> object.
        /// </summary>
        /// <param name="playUrl">Address of the video play page.</param>
        /// <returns>
        /// The info object. If anything throws while downloading or parsing, the
        /// failure is logged and the object is returned with whatever had been
        /// filled in up to that point.
        /// </returns>
        public VideoInfo GetInfo(string playUrl)
        {
            VideoInfo result = new VideoInfo();
            result.PlayUrl = playUrl;

            try
            {
                String html = PageLoader.Download(playUrl);

                // Lazily capture the body of the "video : { ... }" object literal.
                String videoPattern = "video : {(" + "." + "+?)\\}[^,]";
                Match videoMatch = Regex.Match(html, videoPattern, RegexOptions.Singleline);

                // Re-wrap the captured body in braces so it can be read as a JSON object.
                Dictionary<String, Object> fields = JSON.ToDictionary("{" + videoMatch.Groups[1].Value + "}");

                Object raw;
                result.PicUrl = fields.TryGetValue("pic", out raw) ? raw.ToString() : "";
                result.FlashUrl = fields.TryGetValue("swfOutsideUrl", out raw) ? raw.ToString() : "";
                result.Title = fields.TryGetValue("title", out raw) ? raw.ToString() : "";
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + playUrl);
                logger.Error(ex.Message);
            }

            return result;
        }
Example #4
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for a Tudou program page. The video id is
        /// taken from the URL (via <c>strUtil.TrimStart</c> with the
        /// "http://www.tudou.com/programs/view/" prefix, then trailing slashes are
        /// trimmed) and used to compose the flash URL; the page is then downloaded
        /// to read the title and the thumbnail address.
        /// </summary>
        /// <param name="url">Address of the Tudou program page.</param>
        /// <returns>
        /// The info object. Play URL, flash id and flash URL are always set; title
        /// and picture URL are set only when downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(String url)
        {
            String withoutPrefix = strUtil.TrimStart(url, "http://www.tudou.com/programs/view/");
            String videoId = withoutPrefix.TrimEnd('/');
            String swfUrl = string.Format("http://www.tudou.com/v/{0}/v.swf", videoId);

            VideoInfo result = new VideoInfo();
            result.PlayUrl = url;
            result.FlashId = videoId;
            result.FlashUrl = swfUrl;

            try
            {
                String html = PageLoader.Download(url);

                // Title: text of the <title> element, passed through VideoHelper.GetTitle.
                String rawTitle = Regex.Match(html, "<title>([^<]+?)</title>").Groups[1].Value;
                String title = VideoHelper.GetTitle(rawTitle);

                // Thumbnail: first single-quoted value that follows the word "thumbnail".
                String picUrl = Regex.Match(html, "thumbnail[^']+?'([^']+?)'").Groups[1].Value;

                result.Title = title;
                result.PicUrl = picUrl;
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + url);
                logger.Error(ex.Message);
            }

            return result;
        }
Example #5
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for an ifeng video page. The flash id is
        /// the part of the URL after the last '/', taken after
        /// <c>strUtil.TrimEnd</c> has been applied with ".shtml"; picture and title
        /// come from the "img" and "name" entries of the page's
        /// <c>var videoinfo={...};</c> object.
        /// </summary>
        /// <param name="playUrl">Address of the video page.</param>
        /// <returns>
        /// The info object. Play URL, flash id and flash URL are always set; picture
        /// and title are filled in only as far as downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(string playUrl)
        {
            // Everything after the final '/' (or the whole string when there is none).
            String trimmedUrl = strUtil.TrimEnd(playUrl, ".shtml");
            String guid = trimmedUrl.Substring(trimmedUrl.LastIndexOf('/') + 1);

            VideoInfo result = new VideoInfo();
            result.PlayUrl = playUrl;
            result.FlashId = guid;
            result.FlashUrl = string.Format("http://v.ifeng.com/include/exterior.swf?guid={0}&AutoPlay=false", guid);

            try
            {
                String html = PageLoader.Download(playUrl, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", "utf-8");

                Match infoMatch = Regex.Match(html, "var videoinfo=({.+?});");
                Dictionary<String, Object> fields = JSON.ToDictionary(infoMatch.Groups[1].Value);

                Object raw;
                result.PicUrl = fields.TryGetValue("img", out raw) ? raw.ToString() : "";
                result.Title = fields.TryGetValue("name", out raw) ? raw.ToString() : "";
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + playUrl);
                logger.Error(ex.Message);
            }

            return result;
        }
Example #6
0
        // Downloads the detail page and parses it into an HtmlAgilityPack HtmlDocument.
        // A progress line is appended to sb first; if anything throws, the error is
        // reported through logInfo and null is returned.
        protected HtmlDocument getDetailPageBodyHtmlDocument(string detailUrl, SpiderTemplate template, StringBuilder sb)
        {
            try
            {
                sb.AppendLine("抓取详细页..." + detailUrl);

                HtmlDocument document = new HtmlDocument();
                document.OptionAddDebuggingAttributes = false;
                document.OptionAutoCloseOnEnd = true;
                document.OptionFixNestedTags = true;
                document.OptionReadEncoding = true;

                // An empty encoding name is passed when the template does not specify one.
                String encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
                String html = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

                document.LoadHtml(html);
                return document;
            }
            catch (Exception ex)
            {
                logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
                return null;
            }
        }
Example #7
0
        /// <summary>
        /// Downloads the raw body of a detail page, using the template's detail
        /// encoding when it has text and an empty encoding name otherwise. As a side
        /// effect, <c>template.SiteUrl</c> is set from the detail URL.
        /// </summary>
        /// <param name="detailUrl">Address of the detail page.</param>
        /// <param name="template">Spider template supplying the encoding; its SiteUrl is updated.</param>
        /// <param name="sb">Receives a progress line and is passed on to logInfo.</param>
        /// <returns>
        /// The downloaded text (an empty result is reported through logInfo but still
        /// returned), or null when an exception occurred.
        /// </returns>
        protected string getDetailPageBody(string detailUrl, SpiderTemplate template, StringBuilder sb)
        {
            try
            {
                sb.AppendLine("抓取详细页..." + detailUrl);

                String encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
                String body = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

                UrlInfo urlInfo = new UrlInfo(detailUrl);
                template.SiteUrl = urlInfo.SiteUrl;

                bool hasContent = !strUtil.IsNullOrEmpty(body);
                if (!hasContent)
                {
                    logInfo("error=原始页面没有内容:" + detailUrl, detailUrl, template, sb);
                }

                return body;
            }
            catch (Exception ex)
            {
                logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
                return null;
            }
        }
Example #8
0
        /// <summary>
        /// Downloads the list page of a spider template and cuts it down to the part
        /// matched by the template's list-body regular expression.
        /// </summary>
        /// <param name="s">Template supplying the list URL, encoding and body pattern.</param>
        /// <param name="sb">Passed on to logInfo when a problem is reported.</param>
        /// <returns>
        /// The trimmed text of the first match; the empty string when the pattern
        /// does not match; or the download result unchanged when it is null or empty.
        /// Both failure cases are reported through logInfo.
        /// </returns>
        private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
        {
            // An empty encoding name is passed when the template does not specify one.
            String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
            String page = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

            if (strUtil.IsNullOrEmpty(page))
            {
                logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
                return page;
            }

            Match bodyMatch = Regex.Match(page, s.GetListBodyPattern(), RegexOptions.Singleline);
            if (!bodyMatch.Success)
            {
                logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
                return "";
            }

            return bodyMatch.Value.Trim();
        }
Example #9
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for a Youku show page. The video id is
        /// derived from the URL by applying <c>strUtil.TrimStart</c> with
        /// "http://v.youku.com/v_show/id_" and <c>strUtil.TrimEnd</c> with ".html",
        /// and is used to compose the player URL; the page is then downloaded to
        /// read the title and the picture address.
        /// </summary>
        /// <param name="url">Address of the Youku show page.</param>
        /// <returns>
        /// The info object. Play URL, flash URL and flash id are always set; title
        /// and picture URL are set only when downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(String url)
        {
            String videoId = strUtil.TrimEnd(strUtil.TrimStart(url, "http://v.youku.com/v_show/id_"), ".html");
            String swfUrl = string.Format("http://player.youku.com/player.php/sid/{0}/v.swf", videoId);

            VideoInfo result = new VideoInfo();
            result.PlayUrl = url;
            result.FlashUrl = swfUrl;
            result.FlashId = videoId;

            try
            {
                String html = PageLoader.Download(url);

                // Title: text of the <title> element, passed through VideoHelper.GetTitle.
                Match titleMatch = Regex.Match(html, "<title>([^<]+?)</title>");
                String title = VideoHelper.GetTitle(titleMatch.Groups[1].Value);

                // Picture: the value following "pic=" up to the next double quote.
                Match picMatch = Regex.Match(html, "pic=(http://[^:]+?.ykimg.com.+?)\"");
                String picUrl = picMatch.Groups[1].Value;

                result.Title = title;
                result.PicUrl = picUrl;
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + url);
                logger.Error(ex.Message);
            }

            return result;
        }
Example #10
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for a Ku6 show page. The video id is
        /// derived from the URL by applying <c>strUtil.TrimStart</c> with
        /// "http://v.ku6.com/show/" and <c>strUtil.TrimEnd</c> with ".html", and is
        /// used to compose the player URL; the page is then downloaded to read the
        /// title and the content of the <c>s_pic</c> span.
        /// </summary>
        /// <param name="url">Address of the Ku6 show page.</param>
        /// <returns>
        /// The info object. Play URL, flash URL and flash id are always set; title
        /// and picture URL are set only when downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(String url)
        {
            String withoutPrefix = strUtil.TrimStart(url, "http://v.ku6.com/show/");
            String videoId = strUtil.TrimEnd(withoutPrefix, ".html");
            String swfUrl = string.Format("http://player.ku6.com/refer/{0}/v.swf", videoId);

            VideoInfo result = new VideoInfo();
            result.PlayUrl = url;
            result.FlashUrl = swfUrl;
            result.FlashId = videoId;

            try
            {
                String html = PageLoader.Download(url);

                String rawTitle = Regex.Match(html, "<title>([^<]+?)</title>").Groups[1].Value;
                String title = VideoHelper.GetTitle(rawTitle);

                String picUrl = Regex.Match(html, "<span class=\"s_pic\">([^<]+?)</span>").Groups[1].Value;

                result.Title = title;
                result.PicUrl = picUrl;
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + url);
                logger.Error(ex.Message);
            }

            return result;
        }
Example #11
0
        /// <summary>
        /// Downloads the list page of a spider template and, when the template
        /// provides a list-body pattern, narrows the page to the first element that
        /// the pattern selects (the pattern is passed to <c>QuerySelectorAll</c>).
        /// </summary>
        /// <param name="s">Template supplying the list URL, encoding and body pattern.</param>
        /// <param name="sb">Passed on to logInfo when a problem is reported.</param>
        /// <returns>
        /// The trimmed outer HTML of the first selected node; the whole trimmed page
        /// when the template has no pattern; <c>null</c> when the pattern selects
        /// nothing; or the download result unchanged when it is null or empty.
        /// Both failure cases are reported through logInfo.
        /// </returns>
        private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
        {
            // An empty encoding name is passed when the template does not specify one.
            String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
            String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

            if (strUtil.IsNullOrEmpty(target))
            {
                logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
                return target;
            }

            // Fetch the pattern once and reuse it for the check and the query.
            String selector = s.GetListBodyPattern();
            if (strUtil.IsNullOrEmpty(selector))
            {
                // No pattern configured: the whole page is the list body.
                // (Original author's note: this path could later also use a CSS
                // selector to narrow down the set of target URLs.)
                return target.Trim();
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };
            htmlDoc.LoadHtml(target);

            // FirstOrDefault walks the query result at most once and stops at the first
            // hit, instead of counting every match and then copying them all to an array.
            HtmlNode firstMatch = htmlDoc.DocumentNode.QuerySelectorAll(selector).FirstOrDefault();
            if (firstMatch == null)
            {
                logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
                return null;
            }

            return firstMatch.OuterHtml.Trim();
        }
Example #12
0
        // Exercises both Download overloads: one call without an encoding argument and
        // one with an empty encoding name.
        public void testEncoding()
        {
            // Original author's note: an encoding may be supplied; when it is not,
            // it is set according to the ContentType of the WebResponse.
            String baiduPage = PageLoader.Download("http://www.baidu.com/");

            Assert.IsNotEmpty(baiduPage);
            int baiduIndex = baiduPage.IndexOf("百度");
            Assert.Greater(baiduIndex, 0);

            const String userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
            String neteasePage = PageLoader.Download("http://news.163.com/", userAgent, "");

            Assert.IsNotEmpty(neteasePage);
            int neteaseIndex = neteasePage.IndexOf("网易新闻");
            Assert.Greater(neteaseIndex, 0);
        }
Example #13
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for an ifeng page whose video is
        /// identified by the URL fragment: the flash id is the text after the last
        /// '#'. The page is downloaded, the <c>&lt;li name="id"&gt;</c> item for that
        /// id is located, and its <c>img</c> source and <c>h4</c> text are used as
        /// picture URL and title.
        /// </summary>
        /// <param name="playUrl">Address of the page, ending in <c>#flashId</c>.</param>
        /// <returns>
        /// The info object. Play URL, flash id and flash URL are always set; picture
        /// and title are filled in only as far as downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(string playUrl)
        {
            String[] arrItem = playUrl.Split('#');
            String flashId = arrItem[arrItem.Length - 1];

            VideoInfo vi = new VideoInfo();
            vi.PlayUrl = playUrl;
            vi.FlashId = flashId;
            vi.FlashUrl = string.Format("http://v.ifeng.com/include/exterior.swf?guid={0}&AutoPlay=false", flashId);

            try
            {
                String pageBody = PageLoader.Download(playUrl, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", "utf-8");

                // flashId comes straight from the URL, so escape it before splicing it
                // into the pattern; otherwise regex metacharacters in the fragment would
                // change or break the expression.
                String itemPattern = "<li name=\"" + Regex.Escape(flashId) + "\">(.+?)</li>";
                Match mt = Regex.Match(pageBody, itemPattern, RegexOptions.Singleline);

                // Single quotes are replaced with double quotes before the fragment is
                // handed to the XML reader.
                String strHtml = mt.Value.Replace("'", "\"");

                // Dispose both readers deterministically, including on parse errors.
                using (System.IO.StringReader strRd = new System.IO.StringReader(strHtml))
                using (XmlTextReader reader = new XmlTextReader(strRd))
                {
                    reader.MoveToContent();
                    while (reader.Read())
                    {
                        if (equal(reader, "img"))
                        {
                            vi.PicUrl = reader.GetAttribute("src");
                        }
                        else if (equal(reader, "h4"))
                        {
                            vi.Title = reader.ReadString();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + playUrl);
                logger.Error(ex.Message);
            }

            return vi;
        }
Example #14
0
        /// <summary>
        /// Builds a <see cref="VideoInfo"/> for a 56.com video page. The URL is
        /// reduced to its path (via <c>strUtil.TrimStart</c> with "http://" and
        /// "www.56.com", <c>strUtil.TrimEnd</c> with ".html", and trimming leading
        /// slashes); the video id is taken from the second path segment with
        /// <c>strUtil.TrimStart</c> applied for "v_". The page is then downloaded to
        /// read the title and the "img" value.
        /// </summary>
        /// <param name="url">Address of the 56.com video page.</param>
        /// <returns>
        /// The info object. Play URL, flash id and flash URL are always set; title
        /// and picture URL are set only when downloading and parsing succeed
        /// (failures are logged).
        /// </returns>
        public VideoInfo GetInfo(String url)
        {
            String turl = strUtil.TrimStart(url, "http://");
            turl = strUtil.TrimStart(turl, "www.56.com");
            turl = strUtil.TrimEnd(turl, ".html");
            turl = turl.TrimStart('/');

            String[] arrItem = turl.Split('/');

            // The id normally sits in the second path segment. A URL with a single
            // segment used to throw IndexOutOfRangeException here, outside the try
            // block; fall back to that only segment so the method stays best-effort.
            String idSegment = arrItem.Length > 1 ? arrItem[1] : arrItem[0];

            String vid = strUtil.TrimStart(idSegment, "v_");
            String flashUrl = string.Format("http://player.56.com/v_{0}.swf", vid);

            VideoInfo vi = new VideoInfo();
            vi.PlayUrl = url;
            vi.FlashId = vid;
            vi.FlashUrl = flashUrl;

            try
            {
                String pageBody = PageLoader.Download(url);

                Match mt = Regex.Match(pageBody, "<title>([^<]+?)</title>");
                String title = VideoHelper.GetTitle(mt.Groups[1].Value);

                // The picture address is the quoted value of the "img" key; backslashes
                // are removed from it afterwards.
                String pattern = "\"img\":\"([^\"]+?)\"";
                Match m = Regex.Match(pageBody, pattern);
                String picUrl = m.Groups[1].Value.Replace("\\", "").Trim();

                vi.Title = title;
                vi.PicUrl = picUrl;
            }
            catch (Exception ex)
            {
                logger.Error("getUrl=" + url);
                logger.Error(ex.Message);
            }

            return vi;
        }
Example #15
0
        /// <summary>
        /// Retrieves Youku video information for a URL. The video id is obtained
        /// from <c>GetYouKuVideoId</c>, the JSON document at
        /// <c>youKuJsonDataApiUrl</c> (formatted with that id) is downloaded, its
        /// leading <c>{"data":[</c> wrapper and final <c>}</c> are stripped, and the
        /// remainder is imported as a <see cref="YouKuInfo"/>.
        /// </summary>
        /// <param name="url">Address of the Youku video page.</param>
        /// <returns>
        /// The imported info, or <c>null</c> when no video id could be derived or
        /// the download returned nothing.
        /// </returns>
        public static YouKuInfo GetYouKuInfo(string url)
        {
            string vid = GetYouKuVideoId(url);
            if (StringPlus.IsNullOrEmpty(vid))
            {
                return null;
            }

            string jsonData = PageLoader.Download(string.Format(youKuJsonDataApiUrl, vid));
            if (StringPlus.IsNullOrEmpty(jsonData))
            {
                return null;
            }

            // Strip only the leading wrapper. string.Replace would also delete any later
            // occurrence of the same text inside the payload.
            const string strStart = "{\"data\":[";
            if (jsonData.StartsWith(strStart, StringComparison.Ordinal))
            {
                jsonData = jsonData.Substring(strStart.Length);
            }

            // Drop the final closing brace, unless that brace is the entire string.
            // NOTE(review): for a payload shaped like {"data":[{...}]} this leaves a
            // trailing ']' in place, exactly as the original code did — confirm that the
            // importer tolerates it before changing the suffix.
            const string strEnd = "}";
            if (jsonData.Length > strEnd.Length && jsonData.EndsWith(strEnd, StringComparison.Ordinal))
            {
                jsonData = jsonData.Substring(0, jsonData.Length - strEnd.Length);
            }

            YouKuInfo info = (YouKuInfo)YSWL.Json.Conversion.JsonConvert.Import(typeof(YouKuInfo), jsonData);

            return info;
        }
Example #16
0
        /// <summary>
        /// Retrieves Ku6 video information for a URL. The URL is substituted into
        /// <c>ku6XmlDataApiUrl</c>, the resulting XML document is downloaded, and
        /// the <c>type</c> attribute plus the <c>vid</c>, <c>coverurl</c>,
        /// <c>flash</c>, <c>title</c> and <c>desc</c> children of
        /// <c>root/result</c> are copied into a <see cref="Ku6Info"/>.
        /// </summary>
        /// <param name="url">Address of the Ku6 video page.</param>
        /// <returns>
        /// The populated info (child elements that are absent leave the matching
        /// member untouched), or <c>null</c> when the response is empty, is not
        /// well-formed XML, lacks the <c>root/result</c> node or its <c>type</c>
        /// attribute, or when <c>type</c> is non-numeric or -1.
        /// </returns>
        public static Ku6Info GetKu6Info(string url)
        {
            string xmlData = PageLoader.Download(string.Format(ku6XmlDataApiUrl, url));

            // Same guard as IsKu6VideoUrl: nothing downloaded means nothing to parse.
            if (string.IsNullOrEmpty(xmlData))
            {
                return null;
            }

            XmlDocument xmldoc = new XmlDocument();
            try
            {
                xmldoc.LoadXml(xmlData);
            }
            catch (XmlException)
            {
                // This method already reports "no information" as null; a malformed
                // response is treated the same way instead of escaping as an exception.
                return null;
            }

            XmlNode resultNode = xmldoc.SelectSingleNode("root/result");
            if (resultNode == null || resultNode.Attributes == null)
            {
                return null;
            }

            XmlNode typeAttribute = resultNode.Attributes.GetNamedItem("type");
            if (typeAttribute == null)
            {
                return null;
            }

            // -1 is the value the original code treats as "no video"; a value that
            // does not parse as an integer is handled the same way.
            int type;
            if (!int.TryParse(typeAttribute.Value, out type) || type == -1)
            {
                return null;
            }

            Ku6Info info = new Ku6Info();
            info.type = type;

            XmlNode xmlNodeVid = xmldoc.SelectSingleNode("root/result/vid");
            if (xmlNodeVid != null)
            {
                info.vid = xmlNodeVid.InnerText;
            }

            XmlNode xmlNodeCoverurl = xmldoc.SelectSingleNode("root/result/coverurl");
            if (xmlNodeCoverurl != null)
            {
                info.coverurl = xmlNodeCoverurl.InnerText;
            }

            XmlNode xmlNodeFlash = xmldoc.SelectSingleNode("root/result/flash");
            if (xmlNodeFlash != null)
            {
                info.flash = xmlNodeFlash.InnerText;
            }

            XmlNode xmlNodeTitle = xmldoc.SelectSingleNode("root/result/title");
            if (xmlNodeTitle != null)
            {
                info.title = xmlNodeTitle.InnerText;
            }

            XmlNode xmlNodeDesc = xmldoc.SelectSingleNode("root/result/desc");
            if (xmlNodeDesc != null)
            {
                info.desc = xmlNodeDesc.InnerText;
            }

            return info;
        }