예제 #1
0
        /// <summary>
        /// Communication entry point called for every page that has to be gathered. It applies no
        /// gathering rules itself: it forwards the request to <c>cGatherWeb.GetGatherData</c> and
        /// exists so that a failed request can be retried.
        /// </summary>
        /// <param name="Url">Address of the page to gather.</param>
        /// <param name="webCode">Not read by this method; the task-level <c>m_WebCode</c> is forwarded instead.</param>
        /// <param name="cookie">Not read by this method; <c>m_Cookie</c> is forwarded instead.</param>
        /// <param name="startPos">Not read by this method; <c>m_gStartPos</c> is forwarded instead.</param>
        /// <param name="endPos">Not read by this method; <c>m_gEndPos</c> is forwarded instead.</param>
        /// <param name="sPath">Not read by this method; <c>m_SavePath</c> is forwarded instead.</param>
        /// <param name="IsAjax">Forwarded unchanged to <c>cGatherWeb.GetGatherData</c>.</param>
        /// <returns>Whatever <c>cGatherWeb.GetGatherData</c> returns for the page.</returns>
        /// <remarks>
        /// A failed attempt is retried after a 3 second pause until more than <c>m_AgainNumber</c>
        /// attempts have failed. When <c>m_Ignore404</c> is set, an error whose message contains
        /// "404" is not retried. In both cases the original exception is rethrown.
        /// </remarks>
        private DataTable GetGatherData(string Url, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, string sPath, bool IsAjax)
        {
            cGatherWeb gWeb = new cGatherWeb();

            gWeb.CutFlag = m_TaskSplitData.CutFlag;

            int AgainTime = 0;

            while (true)
            {
                try
                {
                    return gWeb.GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);
                }
                catch (System.Exception ex)
                {
                    AgainTime++;

                    // Give up when the retry budget is spent, or when 404 errors are configured
                    // to be skipped and this error looks like one.
                    if (AgainTime > m_AgainNumber || (m_Ignore404 == true && ex.Message.Contains("404")))
                    {
                        if (m_IsErrorLog == true)
                        {
                            // Placeholder: saving the error log is not implemented here.
                        }

                        // "throw;" keeps the stack trace of the original failure ("throw ex;" resets it).
                        throw;
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + "网址:" + Url + "访问发生错,错误信息:" + ex.Message + ",等待3秒重试\n", this.IsErrorLog));

                    Thread.Sleep(3000);

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Warning).ToString() + Url + "正在进行第" + AgainTime + "次重试\n", this.IsErrorLog));
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Percent-encodes the Chinese characters (U+4E00 to U+9FA5) of an address and leaves every
        /// other character untouched.
        /// </summary>
        /// <param name="Url">Address to encode.</param>
        /// <param name="uEncoding">Character encoding used to produce the escaped bytes:
        /// gb2312, gbk and big5 use the code page of the same name, any other value uses UTF-8.</param>
        /// <returns>The address with its Chinese characters replaced by their escaped form.</returns>
        static public string UrlEncode(string Url, cGlobalParas.WebCode uEncoding)
        {
            Regex chinesePattern = new Regex("[\\u4e00-\\u9fa5]", RegexOptions.None);

            // Nothing to escape: hand the address back without resolving any encoding.
            if (!chinesePattern.IsMatch(Url))
            {
                return(Url);
            }

            Encoding targetEncoding;

            switch (uEncoding)
            {
            case cGlobalParas.WebCode.gb2312:
                targetEncoding = Encoding.GetEncoding("gb2312");
                break;

            case cGlobalParas.WebCode.gbk:
                targetEncoding = Encoding.GetEncoding("gbk");
                break;

            case cGlobalParas.WebCode.big5:
                targetEncoding = Encoding.GetEncoding("big5");
                break;

            default:
                // utf8 and every value without its own case
                targetEncoding = Encoding.UTF8;
                break;
            }

            // The escaped text is plain ASCII, so replacing every hit in one pass gives the same
            // result as replacing one character after another.
            return(chinesePattern.Replace(Url, delegate(Match hit)
            {
                return HttpUtility.UrlEncode(hit.Value, targetEncoding);
            }));
        }
예제 #3
0
        /// <summary>
        /// Follows the given navigation rules from a start address by handing it to
        /// <c>PUrlRule</c> at level 1, and returns the addresses that call produces.
        /// </summary>
        /// <remarks>
        /// Notes carried over from the original comments: multi-level navigation was added in
        /// version 1.6; every level is a one-to-many (possibly one-to-one) expansion; whatever the
        /// number of levels, only the final content-page addresses are returned, and they are
        /// meant to be absolute addresses.
        /// </remarks>
        /// <param name="Url">Address navigation starts from.</param>
        /// <param name="nRules">Navigation rules, forwarded to <c>PUrlRule</c>.</param>
        /// <param name="webCode">Page encoding, forwarded to <c>PUrlRule</c>.</param>
        /// <param name="cookie">Cookie text, forwarded to <c>PUrlRule</c>.</param>
        /// <returns>The addresses found, or null when navigation failed with any exception.</returns>
        public List <string> ParseUrlRule(string Url, List <cNavigRule> nRules, cGlobalParas.WebCode webCode, string cookie)
        {
            // The first level always starts from one single address; it is wrapped in a list only
            // to fit the parameter shape PUrlRule expects.
            List <string> startUrls = new List <string>();

            startUrls.Add(Url);

            try
            {
                return(PUrlRule(startUrls, 1, nRules, webCode, cookie));
            }
            catch (System.Exception)
            {
                // Navigation failed, the rules could not be resolved: report that as null.
                return(null);
            }
        }
예제 #4
0
        /// <summary>
        /// Downloads a page and returns the character set announced in its meta tag.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <param name="WebCode">Not read by this method; kept so existing callers keep compiling.</param>
        /// <returns>The charset name found in the first matching meta tag, or an empty string when
        /// the page announces none.</returns>
        static public string GetWebpageCode(string url, cGlobalParas.WebCode WebCode)
        {
            byte[] myDataBuffer;

            // WebClient is disposable; release it as soon as the bytes are downloaded.
            using (WebClient myWebClient = new WebClient())
            {
                myWebClient.Credentials = CredentialCache.DefaultCredentials;
                myDataBuffer            = myWebClient.DownloadData(url);
            }

            // Only the markup of the meta tag is inspected, so the system default encoding is
            // used for this first decoding pass.
            string strWebData = Encoding.Default.GetString(myDataBuffer);

            // Accepts both content="text/html; charset=xxx" and charset="xxx", quoted or not, and
            // stops at the end of the charset name so that following attributes are not captured.
            Match charSetMatch = Regex.Match(strWebData, "<meta[^<>]*?charset\\s*=\\s*[\"']?([^\"'\\s<>/;]+)", RegexOptions.IgnoreCase);

            if (!charSetMatch.Success)
            {
                return("");
            }

            return(charSetMatch.Groups[1].Value);
        }
예제 #5
0
        /// <summary>
        /// Downloads a page and returns its source text. The returned text is also stored in
        /// <c>m_WebpageSource</c>.
        /// </summary>
        /// <param name="url">Address to request. When it contains a
        /// <c>&lt;POST&gt;...&lt;/POST&gt;</c> section, the text before the section is the address,
        /// the text inside it is sent as an ASCII form body and the request uses POST; otherwise
        /// the request uses GET.</param>
        /// <param name="webCode">Encoding used to decode the response. <c>auto</c> takes the charset
        /// from the response Content-Type header and falls back to <c>Encoding.Default</c>; values
        /// without a case of their own are decoded as UTF-8.</param>
        /// <param name="cookie">Cookie text: <c>name=value</c> pairs separated by ';' (several pairs
        /// inside one segment may be separated by '&amp;'). Null or empty sends no cookies.</param>
        /// <param name="startPos">Regular-expression fragment marking where the returned text starts.</param>
        /// <param name="endPos">Regular-expression fragment marking where the returned text ends.
        /// The page is only cut when both fragments are non-empty.</param>
        /// <param name="IsCutnr">True to remove line breaks (and the whitespace directly after them).</param>
        /// <param name="IsAjax">True to URL-decode the text as UTF-8 before it is returned.</param>
        /// <returns>The page text after the optional clean-up, cut and decoding steps. When the
        /// cut markers are given but do not match, the result is an empty string.</returns>
        public string GetHtml(string url, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, bool IsCutnr, bool IsAjax)
        {
            // Separate an optional "<POST>...</POST>" section from the address. One match is used
            // for both the split position and the body so the two can never disagree.
            Match  postMatch  = Regex.Match(url, @"<POST>([\S\s]*)</POST>", RegexOptions.IgnoreCase);
            string requestUrl = postMatch.Success ? url.Substring(0, postMatch.Index) : url;

            HttpWebRequest wReq = (HttpWebRequest)WebRequest.Create(requestUrl);

            wReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";

            // 12 second timeout; set before any network activity so it also covers the POST upload.
            wReq.Timeout = 12000;

            // The referer is the leading part of the address matched by this pattern
            // (empty when the pattern does not match, e.g. for non-http addresses).
            Match refererMatch = Regex.Match(url, @"(http://).[^/]*[?=/]", RegexOptions.IgnoreCase);

            wReq.Referer = refererMatch.Groups[0].Value;

            if (!string.IsNullOrEmpty(cookie))
            {
                CookieContainer CookieCon = new CookieContainer();

                CookieCon.Add(wReq.RequestUri, ParseCookieText(cookie));
                wReq.CookieContainer = CookieCon;
            }

            if (postMatch.Success)
            {
                // NOTE(review): the body is sent as ASCII, so non-ASCII form data is not preserved.
                byte[] pPara = Encoding.ASCII.GetBytes(postMatch.Groups[1].Value);

                wReq.Method        = "POST";
                wReq.ContentType   = "application/x-www-form-urlencoded";
                wReq.ContentLength = pPara.Length;

                using (System.IO.Stream reqStream = wReq.GetRequestStream())
                {
                    reqStream.Write(pPara, 0, pPara.Length);
                }
            }
            else
            {
                wReq.Method = "GET";
            }

            string strWebData;

            // The response and its stream hold a connection; dispose them even when reading fails.
            using (HttpWebResponse wResp = (HttpWebResponse)wReq.GetResponse())
            using (System.IO.Stream respStream = wResp.GetResponseStream())
            {
                Encoding wCode = ResolvePageEncoding(webCode, wResp.ContentType);

                System.IO.Stream bodyStream = respStream;

                if (string.Equals(wResp.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    bodyStream = new GZipStream(respStream, CompressionMode.Decompress);
                }

                using (System.IO.StreamReader reader = new System.IO.StreamReader(bodyStream, wCode))
                {
                    strWebData = reader.ReadToEnd();
                }
            }

            // Remove carriage returns and line feeds.
            if (IsCutnr == true)
            {
                strWebData = Regex.Replace(strWebData, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                strWebData = Regex.Replace(strWebData, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);

                // Strings are immutable, so the result has to be assigned; this drops the
                // carriage returns the two patterns above leave behind.
                strWebData = strWebData.Replace("\r", "");
            }

            // Cut the page between the start and end markers; skipped when either one is empty.
            if (!string.IsNullOrEmpty(startPos) && !string.IsNullOrEmpty(endPos))
            {
                string Splitstr = "(" + startPos + ").*?(" + endPos + ")";

                Match cutMatch = Regex.Match(strWebData, Splitstr);
                strWebData = cutMatch.Groups[0].ToString();
            }

            if (IsAjax == true)
            {
                strWebData = System.Web.HttpUtility.UrlDecode(strWebData, Encoding.UTF8);
            }

            this.m_WebpageSource = strWebData;
            return(strWebData);
        }

        /// <summary>
        /// Turns cookie text ("a=1; b=2", optionally with '&amp;' separated pairs inside a segment)
        /// into a cookie collection with path "/".
        /// </summary>
        /// <param name="cookie">Cookie text; must not be null.</param>
        /// <returns>The parsed cookies. Segments that are empty or carry no name before '=' are skipped.</returns>
        private static CookieCollection ParseCookieText(string cookie)
        {
            CookieCollection cl = new CookieCollection();

            foreach (string sc in cookie.Split(';'))
            {
                string ss = sc.Trim();

                // An '&' after the first character means several pairs share this segment.
                string[] pairs = ss.IndexOf('&') > 0 ? ss.Split('&') : new string[] { ss };

                foreach (string rawPair in pairs)
                {
                    string pair  = rawPair.Trim();
                    int    eqPos = pair.IndexOf('=');

                    // Skips empty segments (for example after a trailing ';') and nameless pairs.
                    if (eqPos <= 0)
                    {
                        continue;
                    }

                    // Everything after the first '=' is the value, so values containing '=' stay intact.
                    cl.Add(new Cookie(pair.Substring(0, eqPos).Trim(), pair.Substring(eqPos + 1).Trim(), "/"));
                }
            }

            return(cl);
        }

        /// <summary>
        /// Picks the encoding used to decode a response body.
        /// </summary>
        /// <param name="webCode">Requested page encoding.</param>
        /// <param name="contentType">Content-Type header of the response; only read for <c>auto</c>.</param>
        /// <returns>The encoding to decode with.</returns>
        private static Encoding ResolvePageEncoding(cGlobalParas.WebCode webCode, string contentType)
        {
            switch (webCode)
            {
            case cGlobalParas.WebCode.auto:
                try
                {
                    Match charSetMatch = Regex.Match(contentType, @"charset\s*=\s*[""']?([^""';\s]+)", RegexOptions.IgnoreCase);

                    if (!charSetMatch.Success)
                    {
                        return(Encoding.Default);
                    }

                    return(Encoding.GetEncoding(charSetMatch.Groups[1].Value));
                }
                catch (System.Exception)
                {
                    // Best effort: a missing header or unknown charset name falls back to the default.
                    return(Encoding.Default);
                }

            case cGlobalParas.WebCode.gb2312:
                return(Encoding.GetEncoding("gb2312"));

            case cGlobalParas.WebCode.gbk:
                return(Encoding.GetEncoding("gbk"));

            case cGlobalParas.WebCode.big5:
                return(Encoding.GetEncoding("big5"));

            default:
                // utf8 and every value without its own case
                return(Encoding.UTF8);
            }
        }
예제 #6
0
        /// <summary>
        /// Gathers data from one page: downloads it, extracts the text between the start and end
        /// markers of every cut flag, arranges the pieces into rows (one column per cut flag),
        /// applies the per-column export limits and finally downloads the files referenced by
        /// non-text columns.
        /// </summary>
        /// <param name="Url">Address of the page. An address containing a double quote or
        /// whitespace is rejected and yields null.</param>
        /// <param name="webCode">Page encoding, forwarded to <c>GetHtml</c>.</param>
        /// <param name="cookie">Cookie text, forwarded to <c>GetHtml</c>.</param>
        /// <param name="startPos">Start marker of the page region, forwarded to <c>GetHtml</c>.</param>
        /// <param name="endPos">End marker of the page region, forwarded to <c>GetHtml</c>.</param>
        /// <param name="sPath">Directory for downloaded files; when null or empty,
        /// <c>data\tem_file</c> below the path returned by <c>Program.getPrjPath()</c> is used.</param>
        /// <param name="IsAjax">Forwarded to <c>GetHtml</c>.</param>
        /// <returns>The gathered table (also kept in <c>tempData</c>), or null when there are no
        /// cut flags, the address is rejected, or nothing on the page matches.</returns>
        public DataTable  GetGatherData(string Url, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, string sPath, bool IsAjax)
        {
            tempData = new DataTable("tempData");

            // Without cut flags there is nothing to extract; report "no data" the same way the
            // other empty outcomes below do.
            if (this.CutFlag.Count == 0)
            {
                tempData = null;
                return(tempData);
            }

            // One string column per cut flag. Any flag that is not plain text means the gathered
            // values are file addresses that have to be downloaded at the end.
            bool IsDownloadFile = false;

            for (int i = 0; i < this.CutFlag.Count; i++)
            {
                tempData.Columns.Add(new DataColumn(this.CutFlag[i].Title, typeof(string)));

                if (this.CutFlag[i].DataType != (int)cGlobalParas.GDataType.Txt)
                {
                    IsDownloadFile = true;
                }
            }

            string strCut = BuildCutExpression();

            // Reject addresses containing a double quote or whitespace.
            if (Regex.IsMatch(Url, "[\"\\s]"))
            {
                tempData = null;
                return(tempData);
            }

            // The return value is not used; the page text is read back through WebpageSource.
            GetHtml(Url, webCode, cookie, startPos, endPos, true, IsAjax);

            Regex           re = new Regex(strCut, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            MatchCollection mc = re.Matches(this.WebpageSource);

            if (mc.Count == 0)
            {
                tempData = null;
                return(tempData);
            }

            FillTableFromMatches(mc);
            ApplyExportLimits();

            if (IsDownloadFile == true)
            {
                DownloadGatheredFiles(Url, sPath);
            }

            return(tempData);
        }

        /// <summary>
        /// Builds the extraction pattern: one alternative per cut flag, made of the escaped start
        /// marker (as a named group), a body pattern chosen by the flag's limit sign and a
        /// look-ahead for the escaped end marker. Alternatives are joined with '|'.
        /// </summary>
        private string BuildCutExpression()
        {
            string strCut = "";

            for (int i = 0; i < this.CutFlag.Count; i++)
            {
                if (strCut.Length > 0)
                {
                    strCut += "|";
                }

                strCut += "(?<" + this.CutFlag[i].Title + ">" + cTool.RegexReplaceTrans(this.CutFlag[i].StartPos) + ")";

                switch (this.CutFlag[i].LimitSign)
                {
                case (int)cGlobalParas.LimitSign.NoLimit:
                    strCut += ".*?";
                    break;

                case (int)cGlobalParas.LimitSign.NoWebSign:
                    strCut += "[^<>]*?";
                    break;

                case (int)cGlobalParas.LimitSign.OnlyCN:
                    strCut += "[\\u4e00-\\u9fa5]*?";
                    break;

                case (int)cGlobalParas.LimitSign.OnlyDoubleByte:
                    strCut += "[^\\x00-\\xff]*?";
                    break;

                case (int)cGlobalParas.LimitSign.OnlyNumber:
                    strCut += "[\\d]*?";
                    break;

                case (int)cGlobalParas.LimitSign.OnlyChar:
                    strCut += "[\\x00-\\xff]*?";
                    break;

                case (int)cGlobalParas.LimitSign.Custom:
                    // The custom expression is inserted as written, without escaping.
                    strCut += this.CutFlag[i].RegionExpression.ToString();
                    break;

                default:
                    strCut += "[\\S\\s]*?";
                    break;
                }

                strCut += "(?=" + cTool.RegexReplaceTrans(this.CutFlag[i].EndPos) + ")";
            }

            return(strCut);
        }

        /// <summary>
        /// Arranges the matches into rows of <c>tempData</c>. Matches arrive in page order, so each
        /// one is assigned to a column by checking which start marker it begins with; this keeps
        /// rows aligned when a value is missing or repeated on the page.
        /// </summary>
        /// <param name="mc">All matches of the extraction pattern on the page.</param>
        private void FillTableFromMatches(MatchCollection mc)
        {
            int m = 0; // index of the next unconsumed match

            while (m < mc.Count)
            {
                DataRow drNew = tempData.NewRow();

                for (int i = 0; i < this.CutFlag.Count; i++)
                {
                    // Matches exhausted: the remaining columns of this row stay empty.
                    if (m >= mc.Count)
                    {
                        break;
                    }

                    if (i == 0)
                    {
                        // A row always starts at the next match belonging to the first column.
                        while (!mc[m].Value.StartsWith(this.CutFlag[i].StartPos, StringComparison.CurrentCultureIgnoreCase))
                        {
                            m++;
                            if (m >= mc.Count)
                            {
                                // No further row can start; the row under construction is dropped.
                                return;
                            }
                        }

                        drNew[i] = mc[m].Value.Substring(this.CutFlag[i].StartPos.Length);
                        m++;
                    }
                    else if (mc[m].Value.StartsWith(this.CutFlag[i].StartPos, StringComparison.CurrentCultureIgnoreCase))
                    {
                        drNew[i] = mc[m].Value.Substring(this.CutFlag[i].StartPos.Length);
                        m++;
                    }
                    else if (mc[m].Value.StartsWith(this.CutFlag[i - 1].StartPos, StringComparison.CurrentCultureIgnoreCase))
                    {
                        // Repeated value of the previous column: skip it and retry this column.
                        m++;
                        i--;
                    }
                    else if (i < this.CutFlag.Count - 1 && mc[m].Value.StartsWith(this.CutFlag[i + 1].StartPos, StringComparison.CurrentCultureIgnoreCase))
                    {
                        // The match belongs to the next column: leave this column empty and let
                        // the next iteration consume the match.
                    }
                    else
                    {
                        // The match fits neither this column nor its neighbours: skip it and
                        // retry this column.
                        m++;
                        i--;
                    }
                }

                tempData.Rows.Add(drNew);
            }
        }

        /// <summary>
        /// Applies the export limit configured on each cut flag to every value of its column.
        /// Replace-style expressions are read as two quoted parts separated by a comma.
        /// </summary>
        private void ApplyExportLimits()
        {
            for (int col = 0; col < this.CutFlag.Count; col++)
            {
                for (int row = 0; row < tempData.Rows.Count; row++)
                {
                    string expression = this.CutFlag[col].ExportExpression;
                    string current    = tempData.Rows[row][col].ToString();

                    switch (this.CutFlag[col].ExportLimit)
                    {
                    case (int)cGlobalParas.ExportLimit.ExportNoWebSign:
                        tempData.Rows[row][col] = getTxt(current);
                        break;

                    case (int)cGlobalParas.ExportLimit.ExportPrefix:
                        tempData.Rows[row][col] = expression + current;
                        break;

                    case (int)cGlobalParas.ExportLimit.ExportSuffix:
                        tempData.Rows[row][col] = current + expression;
                        break;

                    case (int)cGlobalParas.ExportLimit.ExportReplace:
                    {
                        int    comma = expression.IndexOf(",");
                        string oStr  = expression.Substring(1, comma - 2);
                        string nStr  = expression.Substring(comma + 2, expression.Length - comma - 3);

                        tempData.Rows[row][col] = current.Replace(oStr, nStr);
                        break;
                    }

                    case (int)cGlobalParas.ExportLimit.ExportTrimLeft:
                    {
                        int lefti = int.Parse(expression);

                        // Values not longer than the trim length are left untouched.
                        if (current.Length > lefti)
                        {
                            tempData.Rows[row][col] = current.Substring(lefti);
                        }
                        break;
                    }

                    case (int)cGlobalParas.ExportLimit.ExportTrimRight:
                    {
                        int righti = int.Parse(expression);

                        if (current.Length > righti)
                        {
                            tempData.Rows[row][col] = current.Substring(0, current.Length - righti);
                        }
                        break;
                    }

                    case (int)cGlobalParas.ExportLimit.ExportTrim:
                        tempData.Rows[row][col] = current.Trim();
                        break;

                    case (int)cGlobalParas.ExportLimit.ExportRegexReplace:
                    {
                        int    comma = expression.IndexOf(",");
                        string oStr  = expression.Substring(1, comma - 2);
                        string nStr  = expression.Substring(comma + 2, expression.Length - comma - 3);

                        tempData.Rows[row][col] = Regex.Replace(current, oStr, nStr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        break;
                    }

                    default:
                        // ExportNoLimit and unknown values: keep the value as gathered.
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Downloads the file behind every non-empty value of the non-text columns. Downloads
        /// run one after another on the calling thread.
        /// </summary>
        /// <param name="Url">Address of the gathered page, used to resolve relative file addresses.</param>
        /// <param name="sPath">Target directory; null or empty selects the default directory.</param>
        private void DownloadGatheredFiles(string Url, string sPath)
        {
            if (string.IsNullOrEmpty(sPath))
            {
                sPath = Program.getPrjPath() + "data\\tem_file";
            }

            if (!Directory.Exists(sPath))
            {
                Directory.CreateDirectory(sPath);
            }

            // The file name is the text after the last '/' of the gathered address.
            Regex namePattern = new Regex(@"(?<=/)[^/]*", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // Iterate the rows actually stored: a row abandoned during assembly is never added.
            for (int i = 0; i < tempData.Rows.Count; i++)
            {
                for (int j = 0; j < this.CutFlag.Count; j++)
                {
                    if (this.CutFlag[j].DataType == (int)cGlobalParas.GDataType.Txt)
                    {
                        continue;
                    }

                    string FileUrl = tempData.Rows[i][j].ToString();

                    // An empty cell has nothing to download.
                    if (FileUrl.Length == 0)
                    {
                        continue;
                    }

                    MatchCollection urlstr = namePattern.Matches(FileUrl);
                    string          DownloadFileName;

                    if (urlstr.Count == 0)
                    {
                        DownloadFileName = FileUrl;
                    }
                    else
                    {
                        DownloadFileName = urlstr[urlstr.Count - 1].ToString();
                    }

                    DownloadFileName = sPath + "\\" + DownloadFileName;

                    DownloadFile(ResolveDownloadUrl(Url, FileUrl), DownloadFileName);
                }
            }
        }

        /// <summary>
        /// Resolves a gathered file address against the address of the page it came from.
        /// </summary>
        /// <param name="pageUrl">Address of the gathered page; it is not modified.</param>
        /// <param name="FileUrl">Non-empty file address: absolute ("http..."), site-relative
        /// (leading '/') or relative to the page's directory.</param>
        /// <returns>The address to download from.</returns>
        private static string ResolveDownloadUrl(string pageUrl, string FileUrl)
        {
            if (FileUrl.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                return(FileUrl);
            }

            if (FileUrl[0] == '/')
            {
                // Site-relative: keep scheme and host of the page, drop its path.
                int    schemeEnd = pageUrl.IndexOf("://", StringComparison.Ordinal);
                int    hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
                int    pathStart = pageUrl.IndexOf('/', hostStart);
                string siteRoot  = pathStart < 0 ? pageUrl : pageUrl.Substring(0, pathStart);

                return(siteRoot + FileUrl);
            }

            // Relative to the page: keep everything up to and including the last '/'.
            return(pageUrl.Substring(0, pageUrl.LastIndexOf("/") + 1) + FileUrl);
        }
예제 #7
0
        /// <summary>
        /// Resolves one navigation level for a list of parent addresses and recurses into the
        /// next level until the level number equals the number of rules.
        /// </summary>
        /// <remarks>
        /// The rule is looked up by its <c>Level</c> value rather than by its position in
        /// <paramref name="nRules"/>, because the rules are not necessarily stored in level order.
        /// When no rule carries the requested level, an empty rule is passed on.
        /// </remarks>
        /// <param name="pUrl">Parent addresses to expand at this level.</param>
        /// <param name="Level">Navigation level to resolve.</param>
        /// <param name="nRules">All navigation rules.</param>
        /// <param name="webCode">Page encoding, forwarded to <c>GetUrlsByRule</c>.</param>
        /// <param name="cookie">Cookie text, forwarded to <c>GetUrlsByRule</c>.</param>
        /// <returns>The addresses produced by the last level; with no rules at all, a list holding
        /// only the first parent address.</returns>
        public List <string> PUrlRule(List <string> pUrl, int Level, List <cNavigRule> nRules, cGlobalParas.WebCode webCode, string cookie)
        {
            List <string> collected = new List <string>();

            // No navigation configured: the first parent address is the result.
            if (nRules.Count == 0)
            {
                collected.Add(pUrl[0].ToString());
                return(collected);
            }

            // Find the rule text registered for the requested level.
            string levelRule = "";

            foreach (cNavigRule candidate in nRules)
            {
                if (Level == candidate.Level)
                {
                    levelRule = candidate.NavigRule;
                    break;
                }
            }

            // Expand every parent address; parents that yield nothing (null) are skipped.
            foreach (string parentUrl in pUrl)
            {
                List <string> children = GetUrlsByRule(parentUrl.ToString(), levelRule, webCode, cookie);

                if (children != null)
                {
                    collected.AddRange(children);
                }
            }

            // Deepest level reached: these are the final addresses.
            if (Level == nRules.Count)
            {
                return(collected);
            }

            return(PUrlRule(collected, Level + 1, nRules, webCode, cookie));
        }
예제 #8
0
        /// <summary>
        /// Downloads a page and returns the distinct link addresses on it that match a
        /// navigation rule, resolved against the address of the page.
        /// </summary>
        /// <param name="Url">Address of the page. When it contains a <c>{...}</c> parameter section,
        /// the first address returned by <c>SplitWebUrl</c> is downloaded instead.</param>
        /// <param name="UrlRule">Navigation rule. A blank rule returns <paramref name="Url"/> itself.
        /// A rule starting with <c>&lt;Regex:</c> is assembled from its <c>&lt;Regex:..&gt;</c>,
        /// <c>&lt;Common:..&gt;</c> and <c>&lt;End:..&gt;</c> parts; any other rule is escaped with
        /// <c>cTool.RegexReplaceTrans</c> and used as the required beginning of the link.</param>
        /// <param name="webCode">Page encoding, forwarded to <c>GetHtml</c>.</param>
        /// <param name="cookie">Cookie text, forwarded to <c>GetHtml</c>.</param>
        /// <returns>The matching addresses in page order without duplicates, or null when the page
        /// source is empty.</returns>
        public List <string> GetUrlsByRule(string Url, string UrlRule, cGlobalParas.WebCode webCode, string cookie)
        {
            List <string> Urls = new List <string>();

            if (UrlRule.Trim() == "")
            {
                Urls.Add(Url);
                return(Urls);
            }

            // An address with parameters stands for several pages; use the first concrete one.
            string Url1 = Url;

            if (Regex.IsMatch(Url, "{.*}"))
            {
                List <string> Urls1 = SplitWebUrl(Url);
                Url1 = Urls1[0].ToString();
            }

            cGatherWeb gW        = new cGatherWeb();
            string     UrlSource = gW.GetHtml(Url1, webCode, cookie, "", "", true, false);

            if (string.IsNullOrEmpty(UrlSource))
            {
                return(null);
            }

            // NOTE(review): "[href=|src=|open(]" is a character class (one single character), not
            // an alternation of the three attribute names, so the look-behind accepts more
            // positions than the text suggests. Left unchanged because tightening it would
            // change which links existing rules match; confirm the intent before touching it.
            string Rule = "";

            if (UrlRule.StartsWith("<Regex:"))
            {
                Rule = @"(?<=[href=|src=|open(][\W])";

                // Prefix: the text between "<Regex:" and the first '>'.
                string strPre = UrlRule.Substring(UrlRule.IndexOf("<Regex:") + 7, UrlRule.IndexOf(">") - 7);
                Rule += strPre;

                // Middle parts: every "<Common:..>" value, each preceded by a wildcard.
                Regex           cmas = new Regex(@"(?<=<Common:)\S+?(?=>)", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                MatchCollection cs   = cmas.Matches(UrlRule);

                foreach (Match ma in cs)
                {
                    Rule += @"(\S*)" + ma.Value.ToString();
                }

                // Suffix: the "<End:..>" value when present, otherwise run to the closing quote or space.
                if (Regex.IsMatch(UrlRule, "<End:"))
                {
                    string s = UrlRule.Substring(UrlRule.IndexOf("<End:") + 5, UrlRule.Length - UrlRule.IndexOf("<End:") - 6);
                    Rule += @"(\S*)" + s;
                }
                else
                {
                    Rule += @"(\S[^'"">]*)(?=[\s'""])";
                }
            }
            else
            {
                Rule = @"(?<=[href=|src=|open(][\W])" + cTool.RegexReplaceTrans(UrlRule) + @"(\S[^'"">]*)(?=[\s'""])";
            }

            Regex re = new Regex(Rule, RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // Prefixes for relative links. The site root keeps the scheme and host of the page
            // address (whatever the scheme is); the directory is everything up to the last '/'.
            int    schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
            int    hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
            int    pathStart = Url.IndexOf('/', hostStart);
            string siteRoot  = pathStart < 0 ? Url : Url.Substring(0, pathStart);
            string directory = Regex.Match(Url, ".*/").Groups[0].Value.ToString();

            // A page often repeats the same link; keep only the first occurrence of each.
            Dictionary <string, bool> seen = new Dictionary <string, bool>();

            foreach (Match ma in re.Matches(UrlSource))
            {
                string link = ma.Value;

                if (link.Length == 0 || seen.ContainsKey(link))
                {
                    continue;
                }

                seen.Add(link, true);

                if (link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    Urls.Add(link);
                }
                else if (link[0] == '/')
                {
                    Urls.Add(siteRoot + link);
                }
                else
                {
                    Urls.Add(directory + link);
                }
            }

            return(Urls);
        }