private void StartList()
        {
            // Resets the pending-link queue, announces the start, then runs the list
            // analysis inside a task started through TaskFactory. The task-status
            // handler is notified with EnumTaskType.View when the task body finishes.
            _listLinkUrl.Clear();

            MessageOut($"[{modelTask.TaskName}]开始采集数据!请稍候...");

            new TaskFactory().StartNew(() =>
            {
                if (modelTask.IsSpiderUrl != 1)
                {
                    // List collection is switched off for this task.
                    MessageOut("采集列表关闭,不需要采集!");
                }
                else
                {
                    var listHelper = new SpiderListHelper { Model = modelTask };

                    // Invoked by the helper for every link it reports.
                    listHelper.OutTreeNodeHandler += (url, title, cover, nodeIndex) =>
                    {
                        var candidate = new ModelLinkUrl()
                        {
                            Url   = url,
                            Title = title,
                            Cover = cover
                        };

                        // Ignore a URL that is already waiting in the queue
                        // (checked against a snapshot of the queue).
                        foreach (var queued in _listLinkUrl.ToArray())
                        {
                            if (queued.Url == url)
                            {
                                return;
                            }
                        }

                        string report = url + "==" + HtmlHelper.Instance.ParseTags(title);
                        if (DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
                        {
                            // A result for this URL is already stored; report it but do not queue it.
                            report += "采集地址存在!不需要采集!";
                        }
                        else
                        {
                            _listLinkUrl.Enqueue(candidate);
                        }
                        MessageOut(report);
                    };

                    // Forward the helper's progress messages unchanged.
                    listHelper.OutMessageHandler += text => MessageOut(text);

                    listHelper.AnalyzeAllList();

                    MessageOut("分析获取网页个数为" + _listLinkUrl.Count + "个!");
                    MessageOut("采集网站列表完成!");
                }

                OutTaskStatusHandler?.Invoke(EnumTaskType.View);
            });
        }
示例#2
0
        /// <summary>
        /// Persists the editor content for the current record, notifies the
        /// <c>OutEdit</c> subscriber (if any) and closes the form.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnSubmit_Click(object sender, EventArgs e)
        {
            // Take markup or plain text from the editor depending on the isHtml flag.
            string edited;
            if (isHtml)
            {
                edited = this.htmlEditor.InnerHtml;
            }
            else
            {
                edited = this.htmlEditor.InnerText;
            }

            DALContentHelper.UpdateContent(this.TaskName, this.Id, this.HeaderText, edited);

            // Hand the new value back to whoever opened the editor.
            OutEdit?.Invoke(this.Cell, edited);

            this.Hide();
            this.Close();
        }
示例#3
0
        /// <summary>
        /// Loads the stored content of the current record into the HTML editor
        /// when the form opens. Nothing is loaded when <c>Id</c> is null or empty.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void frmTaskDataEdit_Load(object sender, EventArgs e)
        {
            if (string.IsNullOrEmpty(Id))
            {
                return;
            }

            object raw = DALContentHelper.GetContent(this.TaskName, this.Id, this.HeaderText);

            // Convert.ToString(object) maps a null reference to an empty string, so a
            // missing value no longer raises NullReferenceException here.
            string content = Convert.ToString(raw) ?? string.Empty;

            // Content without any self-closing tag marker is treated as plain text.
            // string.Contains performs an ordinal search, unlike IndexOf(string).
            if (!content.Contains("/>"))
            {
                isHtml = false;
            }

            this.htmlEditor.InnerHtml = content;
        }
示例#4
0
        /// <summary>
        /// Fetches the current page of content rows for the task and binds it to
        /// the pager and the data grid.
        /// </summary>
        /// <returns>
        /// The count reported by <c>GetContentList</c> through its ref argument,
        /// or 0 when no task name is set.
        /// </returns>
        private int Bind_DataList()
        {
            if (string.IsNullOrEmpty(this.TaskName))
            {
                return 0;
            }

            // Translate the 1-based current page into a row offset.
            int rowsPerPage = this.Pager.PageSize;
            int firstRow    = (this.Pager.PageCurrent - 1) * rowsPerPage;
            int reported    = 0;

            DataTable page = DALContentHelper.GetContentList(this.TaskName, firstRow, rowsPerPage, ref reported);

            // The navigator and the grid share the pager's binding source.
            this.Pager.bindingSource.DataSource       = page;
            this.Pager.bindingNavigator.BindingSource = this.Pager.bindingSource;
            this.dataGridView_DataList.DataSource     = this.Pager.bindingSource;

            return reported;
        }
示例#5
0
        /// <summary>
        /// Collects link URLs from one list page: downloads the page, optionally
        /// narrows it to the configured cut area, extracts links with a regular
        /// expression, applies the include/exclude filters and queues every URL
        /// that is neither already queued nor already stored for this task.
        /// </summary>
        /// <param name="urlList">Address of the list page to download and scan.</param>
        private void GetAllLinkUrl(string urlList)
        {
            string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

            // The downloader reports failure through these marker strings.
            if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$")
            {
                MessageOut(urlList + "采集地址失败!结果:" + pageContent);
                return;
            }

            // Narrow the page to the configured region when both cut markers are set.
            // NOTE(review): presumably GetBody returns the text between the two markers - confirm.
            if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null)
            {
                pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
                pageContent = CollectionHelper.Instance.GetBody(pageContent,
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart),
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd),
                                                                false,
                                                                false);
                pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
            }

            // Default to the generic anchor-tag pattern; a hand-written pattern replaces it
            // after its placeholders are turned into regex fragments.
            string regexHref = cRegexHelper.RegexATag;
            if (modelTask.IsHandGetUrl == 1)
            {
                regexHref = modelTask.HandCollectionUrlRegex;
                regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
                regexHref = regexHref.Replace("\\(\\*)", ".+?");
                regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
            }

            // The exclusion list does not depend on the match, so split it once up front.
            string[] excludedParts = modelTask.LinkUrlNoMustIncludeStr != null
                ? modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries)
                : new string[0];

            Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);

            for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch())
            {
                // NOTE(review): 1 ms pause per match kept from the original; looks like a
                // deliberate yield/throttle - confirm before removing.
                Thread.Sleep(1);

                // Group 2 carries the link title; links without one are skipped.
                string title = mch.Groups[2].Value;
                if (string.IsNullOrEmpty(title))
                {
                    continue;
                }

                // Group 1 carries the raw href, resolved against the list page address.
                string url = CollectionHelper.Instance.FormatUrl(urlList, mch.Groups[1].Value);
                url = url.Replace("\\", "");

                // Required substring filter. URLs are identifiers, so compare ordinally
                // instead of with the culture-sensitive IndexOf(string) overload.
                if (modelTask.LinkUrlMustIncludeStr != null
                    && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr), StringComparison.Ordinal) == -1)
                {
                    continue;
                }

                // Forbidden substring filter.
                bool excluded = false;
                foreach (string part in excludedParts)
                {
                    if (url.IndexOf(part, StringComparison.Ordinal) > -1)
                    {
                        excluded = true;
                        break;
                    }
                }
                if (excluded)
                {
                    continue;
                }

                // Skip URLs already waiting in the queue (checked against a snapshot).
                bool alreadyQueued = false;
                foreach (var item in _listLinkUrl.ToArray())
                {
                    if (item.Url == url)
                    {
                        alreadyQueued = true;
                        break;
                    }
                }
                if (alreadyQueued)
                {
                    continue;
                }

                // Filter out URLs whose result is already stored for this task.
                string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
                if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
                {
                    ModelLinkUrl m = new ModelLinkUrl();
                    m.Url   = url;
                    m.Title = title;
                    _listLinkUrl.Enqueue(m);
                }
                else
                {
                    msg += "采集地址存在!不需要采集!";
                }
                MessageOut(msg);
            }
        }
示例#6
0
        /// <summary>
        /// Collects link URLs from one list page. The page is downloaded, optionally
        /// narrowed to the configured cut area and scanned with a regular expression.
        /// When no splice template is configured, the URL and title come from capture
        /// groups 1 and 2. Otherwise each URL is built by substituting the named
        /// parameter captures into the splice template.
        /// </summary>
        /// <param name="urlList">Address of the list page to download and scan.</param>
        private void GetAllLinkUrl(string urlList)
        {
            string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

            // The downloader reports failure through these marker strings.
            if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$")
            {
                MessageOut(urlList + "采集地址失败!结果:" + pageContent);
                return;
            }

            // Narrow the page to the configured region when both cut markers are set.
            // NOTE(review): presumably GetBody returns the text between the two markers - confirm.
            if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null)
            {
                pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
                pageContent = CollectionHelper.Instance.GetBody(pageContent,
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart),
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd),
                                                                false,
                                                                false);
                pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
            }

            // A null template is treated like an empty one; the original called Trim()
            // on it directly and would throw when the field was never set.
            bool useSplice = !string.IsNullOrWhiteSpace(modelTask.LinkSpliceUrlStr);

            string regexHref  = cRegexHelper.RegexATag;
            int    paramCount = 0;

            if (modelTask.IsHandGetUrl == 1)
            {
                regexHref = modelTask.HandCollectionUrlRegex;
                if (!useSplice)
                {
                    // Positional mode: placeholders become regex fragments after escaping.
                    regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
                    regexHref = regexHref.Replace("\\(\\*)", ".+?");
                    regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
                }
                else
                {
                    // Splice mode: escape literal '[' but keep the parameter placeholder intact,
                    // then turn each placeholder into a numbered named group.
                    regexHref = regexHref.Replace("[", "\\[");
                    regexHref = regexHref.Replace("\\[参数]", "[参数]");
                    regexHref = regexHref.Replace("(*)", ".+?");

                    int at;
                    while ((at = regexHref.IndexOf("[参数]", StringComparison.Ordinal)) >= 0)
                    {
                        paramCount++;
                        // Swap the first remaining placeholder for its named capture group.
                        regexHref = regexHref.Remove(at, "[参数]".Length);
                        regexHref = regexHref.Insert(at, "(?<参数" + paramCount + ">.+?)");
                    }
                }
            }

            Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);

            if (!useSplice)
            {
                for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch())
                {
                    // NOTE(review): 1 ms pause per match kept from the original - confirm intent.
                    Thread.Sleep(1);

                    // Group 2 carries the link title; links without one are skipped.
                    string title = mch.Groups[2].Value;
                    if (string.IsNullOrEmpty(title))
                    {
                        continue;
                    }

                    string url = CollectionHelper.Instance.FormatUrl(urlList, mch.Groups[1].Value);
                    url = url.Replace("\\", "");

                    QueueLinkIfAccepted(url, title);
                }
            }
            else
            {
                foreach (Match match in reg.Matches(pageContent))
                {
                    Thread.Sleep(1);

                    // Fill every numbered placeholder of the template from the named captures.
                    string spliced = modelTask.LinkSpliceUrlStr;
                    for (int x = 1; x <= paramCount; x++)
                    {
                        string key = "参数" + x;
                        spliced = spliced.Replace("[" + key + "]", match.Groups[key].Value);
                    }

                    string url = CollectionHelper.Instance.FormatUrl(urlList, spliced);
                    url = url.Replace("\\", "");

                    // NOTE(review): the original never assigned a title in splice mode, so links
                    // are queued with an empty title; kept as-is - confirm whether one is expected.
                    QueueLinkIfAccepted(url, string.Empty);
                }
            }
        }

        /// <summary>
        /// Applies the include/exclude filters and the duplicate checks to one candidate
        /// link, queues it when it is new and reports the outcome through MessageOut.
        /// </summary>
        /// <param name="url">Absolute URL of the candidate link.</param>
        /// <param name="title">Title stored with the link and shown in the message.</param>
        private void QueueLinkIfAccepted(string url, string title)
        {
            // Required substring filter. URLs are identifiers, so compare ordinally
            // instead of with the culture-sensitive IndexOf(string) overload.
            if (modelTask.LinkUrlMustIncludeStr != null
                && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr), StringComparison.Ordinal) == -1)
            {
                return;
            }

            // Forbidden substring filter ("||"-separated list).
            if (modelTask.LinkUrlNoMustIncludeStr != null)
            {
                foreach (string part in modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    if (url.IndexOf(part, StringComparison.Ordinal) > -1)
                    {
                        return;
                    }
                }
            }

            // Skip URLs already waiting in the queue (checked against a snapshot).
            foreach (var item in _listLinkUrl.ToArray())
            {
                if (item.Url == url)
                {
                    return;
                }
            }

            // Filter out URLs whose result is already stored for this task.
            string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
            if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
            {
                ModelLinkUrl m = new ModelLinkUrl();
                m.Url   = url;
                m.Title = title;
                _listLinkUrl.Enqueue(m);
            }
            else
            {
                msg += "采集地址存在!不需要采集!";
            }
            MessageOut(msg);
        }