Пример #1
0
        /// <summary>
        /// Runs a gather against the update page with a single cut flag that captures the
        /// first <c>&lt;a href="..."</c> value as a File-type item, then signals the owner
        /// through <c>m_sender.BeginInvoke</c> with <c>true</c>.
        /// </summary>
        private void DownloadSoft()
        {
            Gather.cGatherWeb updateGatherer = new Gather.cGatherWeb();

            // The only gather flag: text between <a href=" and the closing quote, typed as File.
            Task.cWebpageCutFlag linkFlag = new Task.cWebpageCutFlag();
            linkFlag.id        = 0;
            linkFlag.Title     = "版本";
            linkFlag.DataType  = (int)cGlobalParas.GDataType.File;
            linkFlag.StartPos  = "<a href=\"";
            linkFlag.EndPos    = "\"";
            linkFlag.LimitSign = (int)cGlobalParas.LimitSign.NoLimit;
            updateGatherer.CutFlag.Add(linkFlag);

            // The returned table is not inspected; the call is made for whatever it does with
            // the File-type flag (presumably saving the file under Program.getPrjPath() — confirm).
            updateGatherer.GetGatherData(
                "http://www.yijie.net/user/soft/updatesoukey.html",
                cGlobalParas.WebCode.utf8,
                "",
                "",
                "",
                Program.getPrjPath(),
                false);

            // Notify the owner that the download step has finished.
            m_sender.BeginInvoke(m_senderDelegate, new object[] { true });
        }
Пример #2
0
        /// <summary>
        /// Gathers one individual address that was obtained by splitting a navigation page.
        /// </summary>
        /// <param name="Url">Address to gather; URL-encoded first when m_IsUrlEncode is set.</param>
        /// <returns>
        /// True when the gather ran without an exception (even if no data came back);
        /// false when an exception was caught, logged and passed to onError.
        /// </returns>
        private bool GatherParsedUrl(string Url)
        {
            // The instance is only configured here; the actual fetch goes through GetGatherData.
            cGatherWeb gWeb = new cGatherWeb();
            gWeb.CutFlag = m_TaskSplitData.CutFlag;

            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

            try
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                if (m_IsUrlEncode == true)
                {
                    Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode) int.Parse(m_UrlEncode));
                }

                DataTable pageData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                if (pageData == null)
                {
                    // Nothing came back for this address.
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + " 此地址无数据!" + "\n", this.IsErrorLog));
                }
                else
                {
                    m_GatherData.Merge(pageData);

                    // Raise the data event only when there is at least one row.
                    if (pageData.Rows.Count != 0)
                    {
                        e_GData(this, new cGatherDataEventArgs(m_TaskID, pageData));
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                }
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
                onError(ex);
                return false;
            }

            return true;
        }
Пример #3
0
        /// <summary>
        /// Communication wrapper around <c>cGatherWeb.GetGatherData</c>. It applies no gathering
        /// rules itself; it exists so that a failed request can be retried. All pages that need
        /// gathering are expected to go through this method.
        /// </summary>
        /// <param name="Url">Address to gather.</param>
        /// <param name="webCode">Page encoding passed to the gatherer.</param>
        /// <param name="cookie">Cookie string passed to the gatherer.</param>
        /// <param name="startPos">Start marker passed to the gatherer.</param>
        /// <param name="endPos">End marker passed to the gatherer.</param>
        /// <param name="sPath">Save path passed to the gatherer.</param>
        /// <param name="IsAjax">Ajax flag passed to the gatherer.</param>
        /// <returns>The table returned by <c>cGatherWeb.GetGatherData</c>.</returns>
        /// <remarks>
        /// The last exception is rethrown (stack trace preserved) once the number of failures
        /// exceeds m_AgainNumber, or immediately when m_Ignore404 is set and the message
        /// contains "404". Otherwise the call waits 3 seconds and tries again.
        /// </remarks>
        private DataTable GetGatherData(string Url, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, string sPath, bool IsAjax)
        {
            cGatherWeb gWeb = new cGatherWeb();

            gWeb.CutFlag = m_TaskSplitData.CutFlag;

            int AgainTime = 0;

            while (true)
            {
                try
                {
                    // Use the values the caller passed in; the previous code silently ignored
                    // them and always read the m_* fields instead.
                    return gWeb.GetGatherData(Url, webCode, cookie, startPos, endPos, sPath, IsAjax);
                }
                catch (System.Exception ex)
                {
                    AgainTime++;

                    bool retriesExhausted = AgainTime > m_AgainNumber;

                    // NOTE(review): writing to a persistent error log (m_IsErrorLog) was only a
                    // placeholder comment in the original and is still not implemented here.
                    if (retriesExhausted || (m_Ignore404 == true && ex.Message.Contains("404")))
                    {
                        // "throw;" keeps the original stack trace, unlike "throw ex;".
                        throw;
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + "网址:" + Url + "访问发生错,错误信息:" + ex.Message + ",等待3秒重试\n", this.IsErrorLog));

                    Thread.Sleep(3000);

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Warning).ToString() + Url + "正在进行第" + AgainTime + "次重试\n", this.IsErrorLog));
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Runs a gather against the update page with a single cut flag that captures the
        /// first <c>&lt;a href="..."</c> value as a File-type item, then signals the owner
        /// through <c>m_sender.BeginInvoke</c> with <c>true</c>.
        /// </summary>
        private void DownloadSoft()
        {
            Gather.cGatherWeb updateGatherer = new Gather.cGatherWeb();

            // The only gather flag: text between <a href=" and the closing quote, typed as File.
            Task.cWebpageCutFlag linkFlag = new Task.cWebpageCutFlag();
            linkFlag.id = 0;
            linkFlag.Title = "�汾";
            linkFlag.DataType = (int)cGlobalParas.GDataType.File;
            linkFlag.StartPos = "<a href=\"";
            linkFlag.EndPos = "\"";
            linkFlag.LimitSign = (int)cGlobalParas.LimitSign.NoLimit;
            updateGatherer.CutFlag.Add(linkFlag);

            // The returned table is not inspected; the call is made for whatever it does with
            // the File-type flag (presumably saving the file under Program.getPrjPath() — confirm).
            updateGatherer.GetGatherData(
                "http://www.yijie.net/user/soft/updatesoukey.html",
                cGlobalParas.WebCode.utf8,
                "",
                "",
                "",
                Program.getPrjPath(),
                false);

            // Notify the owner that the download step has finished.
            m_sender.BeginInvoke(m_senderDelegate, new object[] { true });
        }
Пример #5
0
        /// <summary>
        /// Runs a one-off gather with the supplied cut flags and returns the resulting table.
        /// </summary>
        /// <returns>
        /// The table returned by <c>cGatherWeb.GetGatherData</c>, or null when the gather threw;
        /// in that case the exception message is shown in an error message box.
        /// </returns>
        private DataTable GatherTestData(string Url,List<cWebpageCutFlag> gCutFlags, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, string sPath, bool IsAjax)
        {
            Gather.cGatherWeb testGatherer = new Gather.cGatherWeb();
            testGatherer.CutFlag = gCutFlags;

            try
            {
                return testGatherer.GetGatherData(Url, webCode, cookie, startPos, endPos, sPath, IsAjax);
            }
            catch (System.Exception ex)
            {
                // Report the failure to the user instead of propagating it.
                string message = rm.GetString ("Error4") + ex.Message;
                string caption = rm.GetString("MessageboxError");
                MessageBox.Show(message, caption, MessageBoxButtons.OK, MessageBoxIcon.Error);
                return null;
            }
        }
        /// <summary>
        /// Gathers the data of a single page and, when <paramref name="IsNext"/> is set, keeps
        /// following the page's "next page" link until no link is found or the link resolves
        /// to the page that was just gathered.
        /// </summary>
        /// <param name="Url">Address of the (first) page to gather.</param>
        /// <param name="IsNext">True to follow next-page links.</param>
        /// <param name="NextRule">
        /// Text inserted as-is (not escaped) into the regular expression that locates the
        /// next-page anchor.
        /// </param>
        /// <returns>
        /// True when gathering finished without an exception; false when an exception was
        /// caught (it is logged, the error counters are raised and onError is called).
        /// </returns>
        private bool GatherSingleUrl(string Url,bool IsNext,string NextRule)
        {
            cGatherWeb gWeb = new cGatherWeb();
            DataTable tmpData;
            string NextUrl = Url;
            string Old_Url = NextUrl;

            bool IsAjax = false;

            if (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl)
            {
                IsAjax = true;
            }

            try
            {
                if (IsNext)
                {
                    do
                    {
                        Url = NextUrl;
                        Old_Url = NextUrl;

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "���ڲɼ���" + Url + "\n", this.IsErrorLog));

                        if (m_IsUrlEncode == true)
                        {
                            Url = cTool.UrlEncode(Url,(cGlobalParas.WebCode)int.Parse ( m_UrlEncode));
                        }

                        tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                        if (tmpData != null)
                        {
                            m_GatherData.Merge(tmpData);

                            // Raise the data event only when rows were actually gathered.
                            if (tmpData.Rows.Count != 0)
                            {
                                e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                            }
                        }

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�ɼ���ɣ�" + Url + "\n", this.IsErrorLog));

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "��ʼ������һҳ�����ȡ��һҳ��ַ\n", this.IsErrorLog));

                        // Fetch the page source again and look for the next-page anchor.
                        string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);

                        string NRule="((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                        Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        string strNext = charSetMatch.Groups[1].Value;

                        if (strNext != "")
                        {
                            // Break the current page address into the pieces needed to turn a
                            // relative link into an absolute one. Works for any scheme and for
                            // addresses without a path or without a query string.
                            int queryStart = Url.IndexOf('?');
                            string pageNoQuery = queryStart < 0 ? Url : Url.Substring(0, queryStart);
                            int schemeEnd = pageNoQuery.IndexOf("://", StringComparison.Ordinal);
                            int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
                            int pathStart = pageNoQuery.IndexOf('/', hostStart);

                            if (strNext.StartsWith("/", StringComparison.Ordinal))
                            {
                                // Root-relative link: prefix with scheme://host of the current page.
                                string siteRoot = pathStart < 0 ? pageNoQuery : pageNoQuery.Substring(0, pathStart);
                                strNext = siteRoot + strNext;
                            }
                            else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                  || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                            {
                                // Already absolute: use as-is.
                            }
                            else if (strNext.StartsWith("?", StringComparison.Ordinal))
                            {
                                // Query-only link: keep the current page, replace its query.
                                strNext = pageNoQuery + strNext;
                            }
                            else
                            {
                                // Document-relative link: prefix with the current page's directory.
                                string pageDir = pathStart < 0
                                    ? pageNoQuery + "/"
                                    : pageNoQuery.Substring(0, pageNoQuery.LastIndexOf('/') + 1);
                                strNext = pageDir + strNext;
                            }

                            // Log the address that was just resolved (NextUrl still holds the
                            // current page at this point).
                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "��һҳ��ַ��ȡ�ɹ���" + strNext + "\n", this.IsErrorLog));
                        }
                        else
                        {
                            // No next-page link: the current page is the last one.
                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�Ѿ�������ҳ" + NextUrl + "\n", this.IsErrorLog));
                        }

                        NextUrl = strNext;
                    }
                    while (NextUrl != "" && Old_Url != NextUrl);
                }
                else
                {
                    if (m_IsUrlEncode == true)
                    {
                        Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
                    }

                    tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                    if (tmpData != null)
                    {
                        m_GatherData.Merge(tmpData);

                        // Raise the data event only when rows were actually gathered.
                        if (tmpData.Rows.Count != 0)
                        {
                            e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                        }
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�ɼ���ɣ�" + Url + "\n", this.IsErrorLog));
                }

                // Raise the gathered-address count event and bump the real gathered counter.
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));

                m_TaskSplitData.GatheredTrueUrlCount++;
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "�ɼ���������" + ex.Message + "\n", this.IsErrorLog));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
                m_TaskSplitData.GatheredTrueErrUrlCount++;
                m_TaskSplitData.GatheredErrUrlCount++;
                onError(ex);
                return false;
            }

            return true;
        }
        /// <summary>
        /// Gathers one individual address that was obtained by splitting a navigation page.
        /// </summary>
        /// <param name="Url">Address to gather; URL-encoded first when m_IsUrlEncode is set.</param>
        /// <returns>
        /// True when the gather ran without an exception (even if no data came back);
        /// false when an exception was caught, logged and passed to onError.
        /// </returns>
        private bool GatherParsedUrl(string Url)
        {
            // The instance is only configured here; the actual fetch goes through GetGatherData.
            cGatherWeb gWeb = new cGatherWeb();
            gWeb.CutFlag = m_TaskSplitData.CutFlag;

            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

            try
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "���ڲɼ���" + Url + "\n", this.IsErrorLog));

                if (m_IsUrlEncode == true)
                {
                    Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
                }

                DataTable pageData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                if (pageData == null)
                {
                    // Nothing came back for this address.
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + " �˵�ַ�����ݣ�" + "\n", this.IsErrorLog));
                }
                else
                {
                    m_GatherData.Merge(pageData);

                    // Raise the data event only when there is at least one row.
                    if (pageData.Rows.Count != 0)
                    {
                        e_GData(this, new cGatherDataEventArgs(m_TaskID, pageData));
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�ɼ���ɣ�" + Url + "\n", this.IsErrorLog));
                }
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "�ɼ���������" + ex.Message + "\n", this.IsErrorLog));
                onError(ex);
                return false;
            }

            return true;
        }
        /// <summary>
        /// Entry point for gathering an address that carries navigation rules. When
        /// <paramref name="IsNext"/> is set, this method handles the "next page" rule itself:
        /// each page is handed to ParseGatherNavigationUrl and then the next-page link is
        /// followed. Without it, ParseGatherNavigationUrl is called once for
        /// <paramref name="Url"/>.
        /// </summary>
        /// <param name="Url">Address of the (first) navigation page.</param>
        /// <param name="nRules">Navigation rules passed through to ParseGatherNavigationUrl.</param>
        /// <param name="IsNext">True to follow next-page links.</param>
        /// <param name="NextRule">
        /// Text inserted as-is (not escaped) into the regular expression that locates the
        /// next-page anchor.
        /// </param>
        /// <returns>
        /// The result of the last ParseGatherNavigationUrl call; false when an exception was
        /// caught. When m_ThreadRunning is cleared mid-loop, returns true only if the pending
        /// address is empty or equal to the previous one.
        /// </returns>
        private bool GatherNavigationUrl(string Url, List<Task.cNavigRule> nRules, bool IsNext, string NextRule)
        {
            cGatherWeb gWeb = new cGatherWeb();
            string NextUrl = Url;
            string Old_Url = NextUrl;
            bool IsSucceed = false;

            try
            {
                if (IsNext)
                {
                    do
                    {
                        if (m_ThreadRunning == true)
                        {
                            Url = NextUrl;
                            Old_Url = NextUrl;

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "���ڲɼ���" + Url + "\n", this.IsErrorLog));

                            IsSucceed = ParseGatherNavigationUrl(Url,nRules) ;

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�ɼ���ɣ�" + Url + "\n", this.IsErrorLog));

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "��ʼ������һҳ�����ȡ��һҳ��ַ\n", this.IsErrorLog));

                            bool IsAjax = false;

                            if (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl)
                            {
                                IsAjax = true;
                            }

                            // Fetch the page source and look for the next-page anchor.
                            string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "",true,IsAjax );

                            string NRule = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                            Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                            string strNext = charSetMatch.Groups[1].Value;

                            if (strNext != "")
                            {
                                // Break the current page address into the pieces needed to turn
                                // a relative link into an absolute one. Works for any scheme and
                                // for addresses without a path or without a query string.
                                int queryStart = Url.IndexOf('?');
                                string pageNoQuery = queryStart < 0 ? Url : Url.Substring(0, queryStart);
                                int schemeEnd = pageNoQuery.IndexOf("://", StringComparison.Ordinal);
                                int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
                                int pathStart = pageNoQuery.IndexOf('/', hostStart);

                                if (strNext.StartsWith("/", StringComparison.Ordinal))
                                {
                                    // Root-relative link: prefix with scheme://host of the current page.
                                    string siteRoot = pathStart < 0 ? pageNoQuery : pageNoQuery.Substring(0, pathStart);
                                    strNext = siteRoot + strNext;
                                }
                                else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                      || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                                {
                                    // Already absolute: use as-is.
                                }
                                else if (strNext.StartsWith("?", StringComparison.Ordinal))
                                {
                                    // Query-only link: keep the current page, replace its query.
                                    strNext = pageNoQuery + strNext;
                                }
                                else
                                {
                                    // Document-relative link: prefix with the current page's directory.
                                    string pageDir = pathStart < 0
                                        ? pageNoQuery + "/"
                                        : pageNoQuery.Substring(0, pageNoQuery.LastIndexOf('/') + 1);
                                    strNext = pageDir + strNext;
                                }

                                // Log the address that was just resolved (NextUrl is only
                                // updated below).
                                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "��һҳ��ַ��ȡ�ɹ���" + strNext + "\n", this.IsErrorLog));
                            }
                            else
                            {
                                // No next-page link: the current page is the last one.
                                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "�Ѿ�������ҳ" + "\n", this.IsErrorLog));
                            }

                            NextUrl = strNext;
                        }
                        else
                        {
                            // The thread was asked to stop: leave the loop right away.
                            // NOTE(review): on the very first iteration Old_Url equals NextUrl,
                            // so a stop request before any page was gathered returns true —
                            // confirm this is what callers expect.
                            return (NextUrl == "" || Old_Url == NextUrl);
                        }
                    }
                    while (NextUrl != "" && Old_Url != NextUrl);
                }
                else
                {
                    IsSucceed = ParseGatherNavigationUrl(Url, nRules);
                }
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "�ɼ���������" + ex.Message + "\n", this.IsErrorLog));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
                m_TaskSplitData.GatheredTrueErrUrlCount++;
                m_TaskSplitData.GatheredErrUrlCount++;
                onError(ex);
                return false;
            }

            return IsSucceed;
        }
        /// <summary>
        /// Communication wrapper around <c>cGatherWeb.GetGatherData</c>. It applies no gathering
        /// rules itself; it exists so that a failed request can be retried. All pages that need
        /// gathering are expected to go through this method.
        /// </summary>
        /// <param name="Url">Address to gather.</param>
        /// <param name="webCode">Page encoding passed to the gatherer.</param>
        /// <param name="cookie">Cookie string passed to the gatherer.</param>
        /// <param name="startPos">Start marker passed to the gatherer.</param>
        /// <param name="endPos">End marker passed to the gatherer.</param>
        /// <param name="sPath">Save path passed to the gatherer.</param>
        /// <param name="IsAjax">Ajax flag passed to the gatherer.</param>
        /// <returns>The table returned by <c>cGatherWeb.GetGatherData</c>.</returns>
        /// <remarks>
        /// The last exception is rethrown (stack trace preserved) once the number of failures
        /// exceeds m_AgainNumber, or immediately when m_Ignore404 is set and the message
        /// contains "404". Otherwise the call waits 3 seconds and tries again.
        /// </remarks>
        private DataTable GetGatherData(string Url, cGlobalParas.WebCode webCode, string cookie, string startPos, string endPos, string sPath, bool IsAjax)
        {
            cGatherWeb gWeb = new cGatherWeb();
            gWeb.CutFlag = m_TaskSplitData.CutFlag;

            int AgainTime = 0;

            while (true)
            {
                try
                {
                    // Use the values the caller passed in; the previous code silently ignored
                    // them and always read the m_* fields instead.
                    return gWeb.GetGatherData(Url, webCode, cookie, startPos, endPos, sPath, IsAjax);
                }
                catch (System.Exception ex)
                {
                    AgainTime++;

                    bool retriesExhausted = AgainTime > m_AgainNumber;

                    // NOTE(review): writing to a persistent error log (m_IsErrorLog) was only a
                    // placeholder comment in the original and is still not implemented here.
                    if (retriesExhausted || (m_Ignore404 == true && ex.Message.Contains ("404")))
                    {
                        // "throw;" keeps the original stack trace, unlike "throw ex;".
                        throw;
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + "��ַ��" + Url + "���ʷ������������Ϣ��" + ex.Message + "���ȴ�3������\n", this.IsErrorLog));

                    Thread.Sleep(3000);

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Warning).ToString() + Url + "���ڽ��е�" + AgainTime + "������\n", this.IsErrorLog));
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Downloads the page at <paramref name="Url"/> and returns the links found in it that
        /// match the navigation rule, made absolute and with duplicates removed.
        /// </summary>
        /// <param name="Url">
        /// Page address. When it contains a <c>{...}</c> parameter it is expanded through
        /// SplitWebUrl and only the first expanded address is downloaded.
        /// </param>
        /// <param name="UrlRule">
        /// Navigation rule. Either plain text (escaped through cTool.RegexReplaceTrans) or a
        /// composite rule starting with <c>&lt;Regex:</c>, optionally followed by
        /// <c>&lt;Common:...&gt;</c> parts and an <c>&lt;End:...&gt;</c> suffix.
        /// </param>
        /// <param name="webCode">Page encoding passed to cGatherWeb.GetHtml.</param>
        /// <param name="cookie">Cookie string passed to cGatherWeb.GetHtml.</param>
        /// <returns>
        /// A list containing only <paramref name="Url"/> when the rule is blank; null when the
        /// page source is empty; otherwise the matched addresses.
        /// </returns>
        public List<string> GetUrlsByRule(string Url, string UrlRule,cGlobalParas.WebCode webCode, string cookie)
        {
            string Url1;
            List<string> Urls=new List<string> ();

            if (UrlRule.Trim() == "")
            {
                Urls.Add(Url);
                return Urls;
            }

            // If the address carries a parameter, expand it and use the first real address.
            if (Regex.IsMatch(Url, "{.*}"))
            {
                List<string> Urls1 = SplitWebUrl(Url );
                Url1 = Urls1[0].ToString();
            }
            else
            {
                Url1 = Url;
            }

            // Download the page source that the rule will be applied to.
            cGatherWeb gW = new cGatherWeb();
            string UrlSource = gW.GetHtml(Url1, webCode, cookie, "", "", true, false);
            gW = null;

            // A null source would make Regex.Matches throw; treat it like an empty page.
            if (string.IsNullOrEmpty(UrlSource))
            {
                return null ;
            }

            string Rule = "";

            if (UrlRule.StartsWith("<Regex:"))
            {
                // NOTE(review): the square brackets make this a character class, so the
                // lookbehind matches one listed character followed by one non-word character
                // rather than the literal words href=/src=/open( — left unchanged on purpose.
                Rule = @"(?<=[href=|src=|open(][\W])";

                // Prefix: the text between "<Regex:" and the first ">".
                string strPre = UrlRule.Substring(UrlRule.IndexOf("<Regex:")+7, UrlRule.IndexOf(">")-7);
                Rule += strPre;

                // Middle parts: every <Common:...> entry, each preceded by a wildcard.
                string cma=@"(?<=<Common:)\S+?(?=>)";

                Regex cmas = new Regex(cma, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                MatchCollection cs = cmas.Matches(UrlRule);
                foreach (Match ma in cs)
                {
                    Rule +=@"(\S*)" + ma.Value.ToString ();
                }

                // Suffix: the <End:...> entry when present, otherwise "up to the closing quote".
                if (Regex.IsMatch(UrlRule, "<End:"))
                {
                    string s = UrlRule.Substring(UrlRule.IndexOf("<End:") + 5, UrlRule.Length - UrlRule.IndexOf("<End:") - 6);
                    Rule += @"(\S*)" + s;
                }
                else
                {
                    Rule += @"(\S[^'"">]*)(?=[\s'""])";
                }
            }
            else
            {
                Rule = @"(?<=[href=|src=|open(][\W])" + cTool.RegexReplaceTrans(UrlRule) + @"(\S[^'"">]*)(?=[\s'""])";
            }

            Regex re = new Regex(Rule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            MatchCollection aa = re.Matches(UrlSource);

            DataTable d1 = new DataTable();
            d1.Columns.Add("Name");

            foreach (Match ma in aa)
            {
                d1.Rows.Add(ma.Value.ToString());
            }

            // The page may list the same address several times; remove duplicates by taking
            // a distinct projection of the single-column table.
            string[] strComuns = new string[d1.Columns.Count];

            for (int m = 0; m < d1.Columns.Count; m++)
            {
                strComuns[m] = d1.Columns[m].ColumnName;
            }

            DataView dv = new DataView(d1);

            DataTable d2 = dv.ToTable(true, strComuns);

            // Work out the two possible prefixes for relative links once, up front.
            // Handles any scheme and addresses that have no path or no query string.
            int queryStart = Url.IndexOf('?');
            string pageNoQuery = queryStart < 0 ? Url : Url.Substring(0, queryStart);
            int schemeEnd = pageNoQuery.IndexOf("://", StringComparison.Ordinal);
            int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
            int pathStart = pageNoQuery.IndexOf('/', hostStart);
            string siteRoot = pathStart < 0 ? pageNoQuery : pageNoQuery.Substring(0, pathStart);
            string pageDir = pathStart < 0
                ? pageNoQuery + "/"
                : pageNoQuery.Substring(0, pageNoQuery.LastIndexOf('/') + 1);

            for (int i = 0; i < d2.Rows.Count; i++)
            {
                string link = d2.Rows[i][0].ToString();

                if (link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    // Already absolute. StartsWith is safe for matches shorter than 4 chars,
                    // where the former Substring(0, 4) threw.
                    Urls.Add(link);
                }
                else if (link.StartsWith("/", StringComparison.Ordinal))
                {
                    // Root-relative link.
                    Urls.Add(siteRoot + link);
                }
                else
                {
                    // Document-relative link.
                    Urls.Add(pageDir + link);
                }
            }

            return Urls;
        }
Пример #11
0
        /// <summary>
        /// Checks the update page for a newer version than the running assembly and reports
        /// progress in textBox1. Enables button2 (and button1) when a newer version is found,
        /// button1 only when the running version is current.
        /// </summary>
        /// <remarks>
        /// Only the first three version components are compared, most significant first.
        /// Old_Copy and New_Copy keep the full version strings after the call.
        /// </remarks>
        private void GetCopy()
        {
            Gather.cGatherWeb gData = new Gather.cGatherWeb();

            this.textBox1.Text =rm.GetString ("Info90");
            Application.DoEvents();
            Old_Copy = Assembly.GetExecutingAssembly().GetName().Version.ToString();
            this.textBox1.Text += "\r\n" + rm.GetString("Info91") + Assembly.GetExecutingAssembly().GetName().Version;
            Application.DoEvents();
            this.textBox1.Text += "\r\n" + rm.GetString("Info92");
            Application.DoEvents();
            SCode = cTool.GetHtmlSource("http://www.yijie.net/user/soft/updatesoukey.html", true);
            if (SCode == "" || SCode == null)
            {
                this.textBox1.Text += "\r\n" + rm.GetString("Info93") + "\r\n" + rm.GetString("Info94");
                Application.DoEvents();
                return;
            }

            this.textBox1.Text += "\r\n" + rm.GetString("Info95") + "\r\n" + rm.GetString("Info96");
            Application.DoEvents();

            // Cut flag 0: the version number text.
            Task.cWebpageCutFlag c;

            c = new Task.cWebpageCutFlag();
            c.id =0;
            c.Title = "�汾";
            c.DataType =(int) cGlobalParas.GDataType.Txt;
            c.StartPos = "�汾��";
            c.EndPos = "</p>";
            c.LimitSign =(int) cGlobalParas.LimitSign.NoLimit;
            gData.CutFlag.Add(c);

            // Cut flag 1: the version description text.
            c = new Task.cWebpageCutFlag();
            c.id = 1;
            c.Title = "˵��";
            c.DataType = (int)cGlobalParas.GDataType.Txt;
            c.StartPos = "˵����";
            c.EndPos = "</p>";
            c.LimitSign = (int)cGlobalParas.LimitSign.NoLimit;
            gData.CutFlag.Add(c);

            DataTable dGather = gData.GetGatherData("http://www.yijie.net/user/soft/updatesoukey.html", cGlobalParas.WebCode.utf8, "", "", "", Program.getPrjPath(),false);

            if (dGather == null || dGather.Rows.Count == 0)
            {
                // Nothing could be extracted from the update page. Reuses the messages shown
                // above when the page source is empty.
                // NOTE(review): confirm the Info93/Info94 wording fits this case.
                this.textBox1.Text += "\r\n" + rm.GetString("Info93") + "\r\n" + rm.GetString("Info94");
                Application.DoEvents();
                return;
            }

            New_Copy = dGather.Rows[0][0].ToString();
            this.textBox1.Text += "\r\n" + rm.GetString("Info97") + New_Copy;
            Application.DoEvents();

            // Compare three levels (00.00.00), most significant first. A larger new component
            // means an update is available; a smaller one means the published version is older,
            // so lower components must not be looked at. A missing component counts as 0, so a
            // three-part published version no longer throws on the last level.
            string[] oldParts = Old_Copy.Split('.');
            string[] newParts = New_Copy.Split('.');

            for (int i = 0; i < 3; i++)
            {
                int Old_V = i < oldParts.Length ? int.Parse(oldParts[i]) : 0;
                int New_V = i < newParts.Length ? int.Parse(newParts[i]) : 0;

                if (New_V >Old_V )
                {
                    this.textBox1.Text += "\r\n" + rm.GetString("Info98");
                    Application.DoEvents();

                    // Show the description that was gathered with cut flag 1.
                    this.textBox1.Text += "\r\n" + dGather.Rows [0][1].ToString ();
                    Application.DoEvents();

                    this.button2.Enabled = true;
                    this.button1.Enabled = true;
                    return;
                }

                if (New_V < Old_V)
                {
                    break;
                }
            }

            this.textBox1.Text += "\r\n" + rm.GetString("Info99");
            Application.DoEvents();

            this.button1.Enabled = true;
        }
Пример #12
0
        /// <summary>
        /// Checks the online update page for a newer release and reports progress in <c>textBox1</c>.
        /// </summary>
        /// <remarks>
        /// The published version is cut from the page between "版本:" and "&lt;/p&gt;", the release
        /// notes between "说明:" and "&lt;/p&gt;". The first three dot-separated components of the
        /// published version are compared with those of the executing assembly, most significant
        /// first; the first component that differs decides the result. When the published version is
        /// newer, the release notes are shown and both <c>button2</c> and <c>button1</c> are enabled;
        /// otherwise only <c>button1</c> is enabled.
        /// </remarks>
        private void GetCopy()
        {
            const string updatePageUrl = "http://www.yijie.net/user/soft/updatesoukey.html";

            Gather.cGatherWeb gData = new Gather.cGatherWeb();

            this.textBox1.Text = rm.GetString("Info90");
            Application.DoEvents();
            Old_Copy            = Assembly.GetExecutingAssembly().GetName().Version.ToString();
            this.textBox1.Text += "\r\n" + rm.GetString("Info91") + Old_Copy;
            Application.DoEvents();
            this.textBox1.Text += "\r\n" + rm.GetString("Info92");
            Application.DoEvents();

            SCode = cTool.GetHtmlSource(updatePageUrl, true);
            if (string.IsNullOrEmpty(SCode))
            {
                this.textBox1.Text += "\r\n" + rm.GetString("Info93") + "\r\n" + rm.GetString("Info94");
                Application.DoEvents();
                return;
            }

            this.textBox1.Text += "\r\n" + rm.GetString("Info95") + "\r\n" + rm.GetString("Info96");
            Application.DoEvents();

            // Cut rule for the version number.
            Task.cWebpageCutFlag versionFlag = new Task.cWebpageCutFlag();
            versionFlag.id        = 0;
            versionFlag.Title     = "版本";
            versionFlag.DataType  = (int)cGlobalParas.GDataType.Txt;
            versionFlag.StartPos  = "版本:";
            versionFlag.EndPos    = "</p>";
            versionFlag.LimitSign = (int)cGlobalParas.LimitSign.NoLimit;
            gData.CutFlag.Add(versionFlag);

            // Cut rule for the release notes.
            Task.cWebpageCutFlag notesFlag = new Task.cWebpageCutFlag();
            notesFlag.id        = 1;
            notesFlag.Title     = "说明";
            notesFlag.DataType  = (int)cGlobalParas.GDataType.Txt;
            notesFlag.StartPos  = "说明:";
            notesFlag.EndPos    = "</p>";
            notesFlag.LimitSign = (int)cGlobalParas.LimitSign.NoLimit;
            gData.CutFlag.Add(notesFlag);

            DataTable dGather = gData.GetGatherData(updatePageUrl, cGlobalParas.WebCode.utf8, "", "", "", Program.getPrjPath(), false);

            if (dGather == null || dGather.Rows.Count == 0)
            {
                // Nothing could be extracted from the page. Reuse the messages of the failed-download
                // path above instead of crashing on Rows[0].
                // NOTE(review): the wording of Info93/Info94 is not visible here - confirm it fits this
                // case, and whether button1 should be re-enabled on the failure paths.
                this.textBox1.Text += "\r\n" + rm.GetString("Info93") + "\r\n" + rm.GetString("Info94");
                Application.DoEvents();
                return;
            }

            New_Copy            = dGather.Rows[0][0].ToString();
            this.textBox1.Text += "\r\n" + rm.GetString("Info97") + New_Copy;
            Application.DoEvents();

            // Versions are expected in the form 00.00.00. Compare major -> minor -> build and let the
            // FIRST differing component decide; a larger minor number must not count as an upgrade
            // when the major number is smaller. Missing or non-numeric components are treated as 0.
            // The Old_Copy / New_Copy fields are left holding the complete version strings.
            string[] oldParts  = Old_Copy.Split('.');
            string[] newParts  = New_Copy.Trim().Split('.');
            bool     hasUpdate = false;

            for (int i = 0; i < 3; i++)
            {
                int oldV = 0;
                int newV = 0;

                if (i < oldParts.Length)
                {
                    int.TryParse(oldParts[i].Trim(), out oldV);
                }

                if (i < newParts.Length)
                {
                    int.TryParse(newParts[i].Trim(), out newV);
                }

                if (newV != oldV)
                {
                    hasUpdate = newV > oldV;
                    break;
                }
            }

            if (hasUpdate)
            {
                this.textBox1.Text += "\r\n" + rm.GetString("Info98");
                Application.DoEvents();

                // Second column holds the release notes cut by the "说明" rule.
                this.textBox1.Text += "\r\n" + dGather.Rows[0][1].ToString();
                Application.DoEvents();

                this.button2.Enabled = true;
                this.button1.Enabled = true;
                return;
            }

            this.textBox1.Text += "\r\n" + rm.GetString("Info99");
            Application.DoEvents();

            this.button1.Enabled = true;
        }
Пример #13
0
        /// <summary>
        /// Entry point for gathering a URL that has navigation rules.
        /// </summary>
        /// <remarks>
        /// Navigation rules come in two kinds: the "next page" rule and page-level navigation.
        /// This method handles the "next page" rule itself and hands every visited page to
        /// <c>ParseGatherNavigationUrl</c>, which deals with page-level navigation.
        /// </remarks>
        /// <param name="Url">Address of the first page to gather.</param>
        /// <param name="nRules">Navigation rules passed through to <c>ParseGatherNavigationUrl</c>.</param>
        /// <param name="IsNext">When true, keep following the "next page" link until none is found or it repeats.</param>
        /// <param name="NextRule">Text inserted into the regular expression that locates the "next page" link.</param>
        /// <returns>
        /// The result of the last <c>ParseGatherNavigationUrl</c> call; false when an exception was
        /// caught. When the thread is asked to stop, true only if there was no further page to visit.
        /// </returns>
        private bool GatherNavigationUrl(string Url, List <Task.cNavigRule> nRules, bool IsNext, string NextRule)
        {
            cGatherWeb gWeb      = new cGatherWeb();
            string     NextUrl   = Url;
            string     Old_Url   = NextUrl;
            bool       IsSucceed = false;

            try
            {
                if (IsNext)
                {
                    do
                    {
                        if (m_ThreadRunning == true)
                        {
                            Url     = NextUrl;
                            Old_Url = NextUrl;

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                            IsSucceed = ParseGatherNavigationUrl(Url, nRules);

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));

                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

                            string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);

                            // Group 1 captures the href value of the link whose text ends with NextRule.
                            string NRule     = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                            Match  nextMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                            string strNext   = nextMatch.Groups[1].Value;

                            if (strNext != "")
                            {
                                // Turn the captured link into an absolute address.
                                if (strNext.StartsWith("/", StringComparison.Ordinal))
                                {
                                    // Root-relative link: prefix it with "scheme://host" of the current page.
                                    // Located via "://" so https works too, and a URL without a path
                                    // slash no longer throws.
                                    int    schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                                    int    hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                                    int    hostEnd   = Url.IndexOf('/', hostStart);
                                    string siteRoot  = (hostEnd < 0) ? Url : Url.Substring(0, hostEnd);

                                    if (schemeEnd < 0)
                                    {
                                        siteRoot = "http://" + siteRoot;
                                    }

                                    strNext = siteRoot + strNext;
                                }
                                else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                                         strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                                {
                                    // Already absolute; use as is.
                                }
                                else if (strNext.StartsWith("?", StringComparison.Ordinal))
                                {
                                    // Query-only link: replace the query string of the current page.
                                    // If the current URL has no query string, append to the whole URL.
                                    Match  beforeQuery = Regex.Match(Url, @".*(?=\?)");
                                    string PreUrl      = beforeQuery.Success ? beforeQuery.Groups[0].Value : Url;
                                    strNext = PreUrl + strNext;
                                }
                                else
                                {
                                    // Directory-relative link: append to everything up to the last "/".
                                    Match  directory = Regex.Match(Url, ".*/");
                                    string PreUrl    = directory.Groups[0].Value;
                                    strNext = PreUrl + strNext;
                                }

                                // Log the resolved address (strNext); NextUrl still holds the current page here.
                                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                            }
                            else
                            {
                                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + "\n", this.IsErrorLog));
                            }

                            NextUrl = strNext;
                        }
                        else if (m_ThreadRunning == false)
                        {
                            // A stop was requested: leave the loop early. Report success only when
                            // there was no further page left to visit.
                            return(NextUrl == "" || Old_Url == NextUrl);
                        }
                    }while (NextUrl != "" && Old_Url != NextUrl);
                }
                else
                {
                    IsSucceed = ParseGatherNavigationUrl(Url, nRules);
                }
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
                m_TaskSplitData.GatheredTrueErrUrlCount++;
                m_TaskSplitData.GatheredErrUrlCount++;
                onError(ex);
                return(false);
            }

            return(IsSucceed);
        }
Пример #14
0
        /// <summary>
        /// Gathers the data of a single web page and, when requested, of the pages reached by
        /// repeatedly following the "next page" link.
        /// </summary>
        /// <param name="Url">Address of the first page to gather.</param>
        /// <param name="IsNext">When true, keep following the "next page" link until none is found or it repeats.</param>
        /// <param name="NextRule">Text inserted into the regular expression that locates the "next page" link.</param>
        /// <returns>True when gathering finished without an exception; false when an exception was caught.</returns>
        private bool GatherSingleUrl(string Url, bool IsNext, string NextRule)
        {
            cGatherWeb gWeb = new cGatherWeb();
            DataTable  tmpData;
            string     NextUrl = Url;
            string     Old_Url = NextUrl;

            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

            try
            {
                if (IsNext)
                {
                    do
                    {
                        Url     = NextUrl;
                        Old_Url = NextUrl;

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                        if (m_IsUrlEncode == true)
                        {
                            Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode) int.Parse(m_UrlEncode));
                        }

                        tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                        if (tmpData != null)
                        {
                            m_GatherData.Merge(tmpData);

                            // Raise the gathered-data event only when something was actually gathered.
                            if (tmpData.Rows.Count > 0)
                            {
                                e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                            }
                        }

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));

                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                        string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);

                        // Group 1 captures the href value of the link whose text ends with NextRule.
                        string NRule     = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                        Match  nextMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                        string strNext   = nextMatch.Groups[1].Value;

                        if (strNext != "")
                        {
                            // Turn the captured link into an absolute address.
                            if (strNext.StartsWith("/", StringComparison.Ordinal))
                            {
                                // Root-relative link: prefix it with "scheme://host" of the current page.
                                // Located via "://" so https works too, and a URL without a path
                                // slash no longer throws.
                                int    schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                                int    hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                                int    hostEnd   = Url.IndexOf('/', hostStart);
                                string siteRoot  = (hostEnd < 0) ? Url : Url.Substring(0, hostEnd);

                                if (schemeEnd < 0)
                                {
                                    siteRoot = "http://" + siteRoot;
                                }

                                strNext = siteRoot + strNext;
                            }
                            else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                                     strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                            {
                                // Already absolute; use as is.
                            }
                            else if (strNext.StartsWith("?", StringComparison.Ordinal))
                            {
                                // Query-only link: replace the query string of the current page.
                                // If the current URL has no query string, append to the whole URL.
                                Match  beforeQuery = Regex.Match(Url, @".*(?=\?)");
                                string PreUrl      = beforeQuery.Success ? beforeQuery.Groups[0].Value : Url;
                                strNext = PreUrl + strNext;
                            }
                            else
                            {
                                // Directory-relative link: append to everything up to the last "/".
                                Match  directory = Regex.Match(Url, ".*/");
                                string PreUrl    = directory.Groups[0].Value;
                                strNext = PreUrl + strNext;
                            }

                            // Log the resolved address (strNext); NextUrl still holds the current page here.
                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                        }
                        else
                        {
                            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + NextUrl + "\n", this.IsErrorLog));
                        }

                        NextUrl = strNext;
                    }while (NextUrl != "" && Old_Url != NextUrl);
                }
                else
                {
                    if (m_IsUrlEncode == true)
                    {
                        Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode) int.Parse(m_UrlEncode));
                    }

                    tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                    if (tmpData != null)
                    {
                        m_GatherData.Merge(tmpData);

                        // Raise the gathered-data event only when something was actually gathered.
                        if (tmpData.Rows.Count > 0)
                        {
                            e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                        }
                    }

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                }

                // Raise the gathered-URL counter event.
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));

                m_TaskSplitData.GatheredTrueUrlCount++;
            }
            catch (System.Exception ex)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
                m_TaskSplitData.GatheredTrueErrUrlCount++;
                m_TaskSplitData.GatheredErrUrlCount++;
                onError(ex);
                return(false);
            }

            return(true);
        }