示例#1
0
        //根据指定的任务ID对当前的任务进行分解,如果有导航页,也需要在此进行
        //分解
        //并初始化此任务的关键数据
        /// <summary>
        /// Loads the task definition identified by <c>m_TaskData.TaskID</c>, copies its
        /// settings into <c>m_TaskData</c>, expands every parameterised address into
        /// concrete links, and partitions those links into sub-task split data.
        /// </summary>
        /// <remarks>
        /// When the task definition cannot be loaded, the task-specific fields of
        /// <c>m_TaskData</c> are reset to empty values and no split data is created.
        /// When the URL count exceeds the thread count (and more than one thread is
        /// configured) one chunk is produced per thread; otherwise a single chunk
        /// covering every link is produced.
        /// </remarks>
        private void SplitTask()
        {
            Task.cTask t = new Task.cTask();

            // Load the task's address information for the configured task id.
            try
            {
                t.LoadTask(Int64.Parse(m_TaskData.TaskID.ToString()));
            }
            catch (System.Exception)
            {
                // The task's entity file failed to load, possibly because it is missing.
                // Empty values are still populated so the UI can display the lost task
                // and the user can delete it from there; this is the deliberate handling
                // for missing task files.
                m_TaskData.SavePath    = "";
                m_TaskData.TaskDemo    = "";
                m_TaskData.StartPos    = "";
                m_TaskData.EndPos      = "";
                m_TaskData.Cookie      = "";
                m_TaskData.WebCode     = cGlobalParas.WebCode.auto;
                m_TaskData.IsLogin     = false;
                m_TaskData.LoginUrl    = "";
                m_TaskData.PublishType = cGlobalParas.PublishType.NoPublish;
                m_TaskData.IsUrlEncode = false;
                m_TaskData.UrlEncode   = "";
                m_TaskData.Weblink     = null;
                m_TaskData.CutFlag     = null;

                return;
            }

            // The gather start/end positions, page rules and addresses live in the
            // task's own XML file rather than in taskrun. m_TaskData was loaded from
            // taskrun (the same loader feeds the task list shown in the UI), so these
            // values have to be copied over from the task definition here.
            m_TaskData.SavePath    = t.SavePath;
            m_TaskData.TaskDemo    = t.TaskDemo;
            m_TaskData.StartPos    = t.StartPos;
            m_TaskData.EndPos      = t.EndPos;
            m_TaskData.Cookie      = t.Cookie;
            m_TaskData.WebCode     = (cGlobalParas.WebCode) int.Parse(t.WebCode);
            m_TaskData.IsLogin     = t.IsLogin;
            m_TaskData.LoginUrl    = t.LoginUrl;
            m_TaskData.PublishType = (cGlobalParas.PublishType) int.Parse(t.ExportType);
            m_TaskData.IsUrlEncode = t.IsUrlEncode;
            m_TaskData.UrlEncode   = t.UrlEncode;

            m_TaskData.GatherAgainNumber = t.GatherAgainNumber;
            m_TaskData.IsIgnore404       = t.IsIgnore404;
            m_TaskData.IsErrorLog        = t.IsErrorLog;
            m_TaskData.IsDelRepRow       = t.IsDelRepRow;
            m_TaskData.IsTrigger         = t.IsTrigger;
            if (t.IsTrigger)
            {
                m_TaskData.TriggerType = t.TriggerType;
                m_TaskData.TriggerTask = t.TriggerTask;
            }

            // Load the page addresses. An address containing a "{...}" parameter is
            // expanded into its concrete URLs, so the number of links can grow here.
            Task.cUrlAnalyze u = new Task.cUrlAnalyze();

            for (int i = 0; i < t.WebpageLink.Count; i++)
            {
                Task.cWebLink source  = t.WebpageLink[i];
                string        address = source.Weblink.ToString();

                if (!Regex.IsMatch(address, "{.*}"))
                {
                    m_TaskData.Weblink.Add(source);
                    continue;
                }

                // NOTE(review): the original tested m_TaskData.IsUrlEncode here but called
                // the same SplitWebUrl overload in both branches, so the flag has no effect
                // on expansion - confirm whether an encoding-aware overload was intended.
                List<string> Urls = u.SplitWebUrl(address);

                for (int j = 0; j < Urls.Count; j++)
                {
                    Task.cWebLink w = new Task.cWebLink();
                    w.IsGathered   = source.IsGathered;
                    w.IsNavigation = source.IsNavigation;
                    w.IsNextpage   = source.IsNextpage;
                    w.NextPageRule = source.NextPageRule;
                    w.Weblink      = Urls[j];

                    // Navigation rules are only carried over for navigation pages.
                    if (source.IsNavigation)
                    {
                        w.NavigRules = source.NavigRules;
                    }

                    m_TaskData.Weblink.Add(w);
                }
            }

            m_TaskData.CutFlag = t.WebpageCutFlag;

            string sPath = m_TaskData.SavePath + "\\" + m_TaskData.TaskName + "_file";

            // Split into chunks only when there are more URLs than threads and more
            // than one thread is configured.
            if (m_TaskData.UrlCount > m_TaskData.ThreadCount && m_TaskData.ThreadCount > 1)
            {
                int SplitUrlCount = (int)Math.Ceiling((decimal)m_TaskData.UrlCount / (decimal)m_TaskData.ThreadCount);
                int linkCount     = m_TaskData.Weblink.Count;
                int EndIndex      = 0;

                for (int i = 1; i <= m_TaskData.ThreadCount; i++)
                {
                    int StartIndex = EndIndex;

                    // The last chunk takes whatever remains. Earlier chunks are clamped to
                    // the number of links: rounding the chunk size up can push
                    // i * SplitUrlCount past the end of the list (e.g. 5 links on 4 threads),
                    // which previously indexed out of range. A trailing chunk may therefore
                    // be empty, like the single-chunk path below when there are no links.
                    if (i == m_TaskData.ThreadCount)
                    {
                        EndIndex = linkCount;
                    }
                    else
                    {
                        EndIndex = Math.Min(i * SplitUrlCount, linkCount);
                    }

                    List<Task.cWebLink> tWeblink = new List<Task.cWebLink>(EndIndex - StartIndex);

                    for (int j = StartIndex; j < EndIndex; j++)
                    {
                        tWeblink.Add(m_TaskData.Weblink[j]);
                    }

                    // Initialise the sub-task with its inclusive index range and links.
                    cGatherTaskSplit dtc = CreateGatherTaskSplit(sPath);
                    dtc.SetSplitData(StartIndex, EndIndex - 1, tWeblink, m_TaskData.CutFlag);

                    m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
                }
            }
            else
            {
                // No splitting: a single sub-task covers every link.
                cGatherTaskSplit dtc = CreateGatherTaskSplit(sPath);

                dtc.SetSplitData(0, m_TaskData.UrlCount - 1, m_TaskData.Weblink, m_TaskData.CutFlag);
                m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
            }
        }

        // Creates a sub-task splitter carrying the settings shared by every chunk of the current task.
        private cGatherTaskSplit CreateGatherTaskSplit(string savePath)
        {
            cGatherTaskSplit dtc = new cGatherTaskSplit();

            dtc.TaskManage  = m_TaskManage;
            dtc.TaskID      = m_TaskData.TaskID;
            dtc.WebCode     = m_TaskData.WebCode;
            dtc.IsUrlEncode = m_TaskData.IsUrlEncode;
            dtc.UrlEncode   = m_TaskData.UrlEncode;
            dtc.Cookie      = m_TaskData.Cookie;
            dtc.StartPos    = m_TaskData.StartPos;
            dtc.EndPos      = m_TaskData.EndPos;
            dtc.SavePath    = savePath;
            dtc.AgainNumber = m_TaskData.GatherAgainNumber;
            dtc.Ignore404   = m_TaskData.IsIgnore404;
            dtc.IsErrorLog  = m_TaskData.IsErrorLog;

            return dtc;
        }
        //Gathers pages that require navigation; the navigation-page rules are processed here
        /// <summary>
        /// Applies the navigation rules to a navigation page to obtain the addresses
        /// it leads to, then gathers each of those addresses in order.
        /// </summary>
        /// <param name="Url">Address of the navigation page; also the base used to resolve relative links.</param>
        /// <param name="nRules">Navigation rules handed to <c>cUrlAnalyze.ParseUrlRule</c>; their count is logged as the navigation depth.</param>
        /// <returns>
        /// <c>false</c> when the rules yield no addresses, or when the thread is asked
        /// to stop before every address has been visited; otherwise <c>true</c>.
        /// A failure on an individual address is reported through the error event and
        /// <c>onError</c> and does not change the result.
        /// </returns>
        private bool ParseGatherNavigationUrl(string Url, List<Task.cNavigRule> nRules)
        {
            Task.cUrlAnalyze u = new Task.cUrlAnalyze();

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据导航规则获取网页地址,请等待......\n导航层级为:" + nRules.Count + " 层\n", this.IsErrorLog));

            List<string> gUrls = u.ParseUrlRule(Url, nRules, m_WebCode, m_Cookie);

            if (gUrls == null || gUrls.Count == 0)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + Url + " 导航解析失败,有可能是由于导航规则配置错误,也有可能是由于垃圾数据造成,如果是垃圾数据,则不影响系统对数据的采集\n", this.IsErrorLog));
                return false;
            }

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "成功根据导航规则获取" + gUrls.Count + "个网址\n", this.IsErrorLog));

            // One navigation address has turned into gUrls.Count real addresses, so the
            // sub-task's real-URL total grows by the difference, and the owning task is
            // told to re-initialise its own total through the count event. Only the
            // real-URL totals change here; the plain URL total is maintained separately.
            m_TaskSplitData.TrueUrlCount += gUrls.Count - 1;
            e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ReIni, gUrls.Count));

            foreach (string link in gUrls)
            {
                // A stop request abandons the remaining addresses, so the run is incomplete.
                if (!m_ThreadRunning)
                {
                    return false;
                }

                try
                {
                    if (string.IsNullOrEmpty(link))
                    {
                        // Routed through the catch below so it is counted and reported like any other bad address.
                        throw new ArgumentException("Navigation rule produced an empty URL.");
                    }

                    string target;

                    if (link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                    {
                        // Already absolute.
                        target = link;
                    }
                    else
                    {
                        // Locate the first '/' after the "scheme://host" part of the page
                        // address. Working on the text (instead of assuming a 7-character
                        // "http://" prefix) keeps https pages and host-only addresses correct.
                        int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                        int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                        int pathStart = Url.IndexOf('/', hostStart);

                        if (link[0] == '/')
                        {
                            // Root-relative: append to "scheme://host".
                            string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                            target = siteRoot + link;
                        }
                        else
                        {
                            // Document-relative: append to the page's directory (text up to the last '/').
                            string directory = (pathStart < 0) ? Url + "/" : Url.Substring(0, Url.LastIndexOf('/') + 1);
                            target = directory + link;
                        }
                    }

                    GatherParsedUrl(target);

                    // Raise the gathered-address count event.
                    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
                    m_TaskSplitData.GatheredTrueUrlCount++;
                }
                catch (System.Exception ex)
                {
                    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                    m_TaskSplitData.GatheredTrueErrUrlCount++;
                    onError(ex);
                }
            }

            return true;
        }
示例#3
0
        //用于采集需要导航的网页,在此处理导航页规则
        /// <summary>
        /// Applies the navigation rules to a navigation page to obtain the addresses
        /// it leads to, then gathers each of those addresses in order.
        /// </summary>
        /// <param name="Url">Address of the navigation page; also the base used to resolve relative links.</param>
        /// <param name="nRules">Navigation rules handed to <c>cUrlAnalyze.ParseUrlRule</c>; their count is logged as the navigation depth.</param>
        /// <returns>
        /// <c>false</c> when the rules yield no addresses, or when the thread is asked
        /// to stop before every address has been visited; otherwise <c>true</c>.
        /// A failure on an individual address is reported through the error event and
        /// <c>onError</c> and does not change the result.
        /// </returns>
        private bool ParseGatherNavigationUrl(string Url, List <Task.cNavigRule> nRules)
        {
            Task.cUrlAnalyze u = new Task.cUrlAnalyze();

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据导航规则获取网页地址,请等待......\n导航层级为:" + nRules.Count + " 层\n", this.IsErrorLog));

            List<string> gUrls = u.ParseUrlRule(Url, nRules, m_WebCode, m_Cookie);

            if (gUrls == null || gUrls.Count == 0)
            {
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + Url + " 导航解析失败,有可能是由于导航规则配置错误,也有可能是由于垃圾数据造成,如果是垃圾数据,则不影响系统对数据的采集\n", this.IsErrorLog));
                return false;
            }

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "成功根据导航规则获取" + gUrls.Count + "个网址\n", this.IsErrorLog));

            // One navigation address has turned into gUrls.Count real addresses, so the
            // sub-task's real-URL total grows by the difference, and the owning task is
            // told to re-initialise its own total through the count event. Only the
            // real-URL totals change here; the plain URL total is maintained separately.
            m_TaskSplitData.TrueUrlCount += gUrls.Count - 1;
            e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ReIni, gUrls.Count));

            foreach (string link in gUrls)
            {
                // A stop request abandons the remaining addresses, so the run is incomplete.
                if (!m_ThreadRunning)
                {
                    return false;
                }

                try
                {
                    if (string.IsNullOrEmpty(link))
                    {
                        // Routed through the catch below so it is counted and reported like any other bad address.
                        throw new ArgumentException("Navigation rule produced an empty URL.");
                    }

                    string target;

                    if (link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                    {
                        // Already absolute.
                        target = link;
                    }
                    else
                    {
                        // Locate the first '/' after the "scheme://host" part of the page
                        // address. Working on the text (instead of assuming a 7-character
                        // "http://" prefix) keeps https pages and host-only addresses correct.
                        int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                        int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                        int pathStart = Url.IndexOf('/', hostStart);

                        if (link[0] == '/')
                        {
                            // Root-relative: append to "scheme://host".
                            string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                            target = siteRoot + link;
                        }
                        else
                        {
                            // Document-relative: append to the page's directory (text up to the last '/').
                            string directory = (pathStart < 0) ? Url + "/" : Url.Substring(0, Url.LastIndexOf('/') + 1);
                            target = directory + link;
                        }
                    }

                    GatherParsedUrl(target);

                    // Raise the gathered-address count event.
                    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
                    m_TaskSplitData.GatheredTrueUrlCount++;
                }
                catch (System.Exception ex)
                {
                    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                    m_TaskSplitData.GatheredTrueErrUrlCount++;
                    onError(ex);
                }
            }

            return true;
        }