/// <summary>
        /// Clears the pending link queue and starts a background task that collects
        /// the list of link URLs for the current task.
        /// </summary>
        /// <remarks>
        /// When <c>modelTask.IsSpiderUrl</c> is 1, a <c>SpiderListHelper</c> analyzes the
        /// list pages; every reported URL that is neither queued already nor present in the
        /// stored spider results is added to <c>_listLinkUrl</c>. After list analysis has
        /// finished (or was skipped) <c>OutTaskStatusHandler</c> is raised with
        /// <c>EnumTaskType.View</c>. If the analysis throws, the error is reported through
        /// <c>MessageOut</c> and rethrown, so the handler is not raised in that case.
        /// </remarks>
        private void StartList()
        {
            _listLinkUrl.Clear();

            MessageOut($"[{modelTask.TaskName}]开始采集数据!请稍候...");

            // Fire-and-forget: nothing awaits or inspects this task, so a failure must be
            // reported from inside the task body or it would vanish silently.
            new TaskFactory().StartNew(() =>
            {
                try
                {
                    if (modelTask.IsSpiderUrl == 1)
                    {
                        var spiderList = new SpiderListHelper();
                        spiderList.Model = modelTask;
                        spiderList.OutTreeNodeHandler += (string url, string title, string cover, int nodeIndex) =>
                        {
                            // Ignore URLs that are already waiting in the queue.
                            foreach (var queued in _listLinkUrl.ToArray())
                            {
                                if (queued.Url == url)
                                {
                                    return;
                                }
                            }

                            string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
                            if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
                            {
                                _listLinkUrl.Enqueue(new ModelLinkUrl()
                                {
                                    Url   = url,
                                    Title = title,
                                    Cover = cover
                                });
                            }
                            else
                            {
                                // Already present in the stored results: report it, do not queue it.
                                msg += "采集地址存在!不需要采集!";
                            }
                            MessageOut(msg);
                        };
                        spiderList.OutMessageHandler += (string msg) =>
                        {
                            MessageOut(msg);
                        };
                        spiderList.AnalyzeAllList();

                        MessageOut("分析获取网页个数为" + _listLinkUrl.Count + "个!");
                        MessageOut("采集网站列表完成!");
                    }
                    else
                    {
                        MessageOut("采集列表关闭,不需要采集!");
                    }
                    OutTaskStatusHandler?.Invoke(EnumTaskType.View);
                }
                catch (System.Exception ex)
                {
                    // Surface the failure to the user, then rethrow so the task still
                    // ends up faulted exactly as it did before this handler existed.
                    MessageOut($"[{modelTask.TaskName}]采集列表出错:{ex.Message}");
                    throw;
                }
            });
        }
Esempio n. 2
0
        /// <summary>
        /// Downloads one list page, extracts the links it contains and queues every link
        /// that passes the task's URL filters and has not been collected before.
        /// </summary>
        /// <param name="urlList">Address of the list page to download and scan.</param>
        /// <remarks>
        /// A link is skipped when its title group is empty, when it lacks
        /// <c>modelTask.LinkUrlMustIncludeStr</c>, when it contains any of the
        /// "||"-separated fragments in <c>modelTask.LinkUrlNoMustIncludeStr</c>, or when
        /// it is already in <c>_listLinkUrl</c>. Links already present in the stored spider
        /// results are reported through <c>MessageOut</c> but not queued.
        /// </remarks>
        private void GetAllLinkUrl(string urlList)
        {
            string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

            if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$")
            {
                MessageOut(urlList + "采集地址失败!结果:" + pageContent);
                return;
            }

            // Optionally narrow the page down to the configured start/end markers.
            if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null)
            {
                pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
                pageContent = CollectionHelper.Instance.GetBody(pageContent,
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart),
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd),
                                                                false,
                                                                false);
                pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
            }

            string regexHref = cRegexHelper.RegexATag;
            if (modelTask.IsHandGetUrl == 1)
            {
                // Turn the user's template into a regex: "(*)" is a wildcard and
                // "[参数]" is a capture group.
                regexHref = modelTask.HandCollectionUrlRegex;
                regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
                regexHref = regexHref.Replace("\\(\\*)", ".+?");
                regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
            }
            Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);

            // The filter settings do not change while the page is scanned, so they are
            // prepared once instead of once per match.
            string mustInclude = modelTask.LinkUrlMustIncludeStr != null
                ? Convert.ToString(modelTask.LinkUrlMustIncludeStr)
                : null;
            string[] mustNotInclude = modelTask.LinkUrlNoMustIncludeStr != null
                ? modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries)
                : new string[0];

            for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch())
            {
                Thread.Sleep(1);

                // Group 2 is used as the link title, group 1 as the link address.
                string title = mch.Groups[2].Value;
                if (string.IsNullOrEmpty(title))
                {
                    continue;
                }
                string url = CollectionHelper.Instance.FormatUrl(urlList, mch.Groups[1].Value);
                url = url.Replace("\\", "");

                // URLs are machine identifiers: match them ordinally, not with the
                // current culture's linguistic rules.
                if (mustInclude != null && url.IndexOf(mustInclude, StringComparison.Ordinal) == -1)
                {
                    continue;
                }
                bool isExcluded = false;
                foreach (string fragment in mustNotInclude)
                {
                    if (url.IndexOf(fragment, StringComparison.Ordinal) > -1)
                    {
                        isExcluded = true;
                        break;
                    }
                }
                if (isExcluded)
                {
                    continue;
                }

                // Skip URLs that are already waiting in the queue.
                bool alreadyQueued = false;
                foreach (var queued in _listLinkUrl.ToArray())
                {
                    if (queued.Url == url)
                    {
                        alreadyQueued = true;
                        break;
                    }
                }
                if (alreadyQueued)
                {
                    continue;
                }

                // Filter out links that already exist in the stored results.
                string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
                if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
                {
                    ModelLinkUrl m = new ModelLinkUrl();
                    m.Url   = url;
                    m.Title = title;
                    _listLinkUrl.Enqueue(m);
                }
                else
                {
                    msg += "采集地址存在!不需要采集!";
                }
                MessageOut(msg);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Takes one pending link from the queue, downloads its page, extracts every
        /// configured label from it and inserts the result into the task's local SQLite
        /// <c>Content</c> table (skipped when a row with the same <c>HrefSource</c> exists).
        /// </summary>
        /// <param name="index">Not read by this method.</param>
        /// <param name="threadindex">Not read by this method.</param>
        /// <remarks>
        /// Does nothing unless <c>modelTask.IsSpiderContent</c> is 1. After handling a link
        /// (or reporting that the queue is empty) the calling thread sleeps 2, 4 or 6 seconds.
        /// </remarks>
        private void Run_ViewUrl(int index, int threadindex)
        {
            if (modelTask.IsSpiderContent == 1)
            {
                if (_listLinkUrl.Count > 0)
                {
                    ProressNum++;
                    if (OutPutTaskProgressBarDelegate != null)
                    {
                        MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
                        ea.ProgressNum = ProressNum;
                        ea.RecordNum   = TaskCount;
                        ea.TaskIndex   = TaskIndex;
                        OutPutTaskProgressBarDelegate(this, ea);
                    }
                    ModelLinkUrl mlink       = _listLinkUrl.Dequeue();
                    string       url         = mlink.Url;
                    string       pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                    string       title       = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

                    // The URL is embedded in SQL text below, so its quotes are doubled the
                    // same way the label values are.
                    string escapedUrl = (url ?? string.Empty).Replace("'", "''");

                    // Column list and value list of the INSERT. Both start with the source
                    // URL so the statement stays valid even when no label is configured.
                    StringBuilder columns = new StringBuilder("HrefSource");
                    StringBuilder values  = new StringBuilder("'" + escapedUrl + "'");

                    foreach (ModelTaskLabel m in modelTask.ListTaskLabel)
                    {
                        string regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                        regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
                        string cutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

                        // Rewrite image references in the extracted content to remote links.
                        string[] tagImgList = CollectionHelper.Instance.GetImgTag(cutContent);
                        foreach (string tagimg in tagImgList)
                        {
                            if (string.IsNullOrEmpty(tagimg))
                            {
                                break;
                            }
                            string newTagImg = CollectionHelper.Instance.FormatUrl(modelTask.TestViewUrl, tagimg);
                            cutContent = cutContent.Replace(tagimg, newTagImg);

                            // Queue the remote image for download.
                            if (m.IsDownResource == 1)
                            {
                                // NOTE(review): FileInfo is applied to a URL here, and the target
                                // name only has one-second resolution, so several images handled
                                // within the same second share one name. Confirm both are acceptable.
                                FileInfo fImg = new FileInfo(newTagImg);
                                string   ext  = fImg.Extension;
                                ext = string.IsNullOrEmpty(ext) ? ".jpg" : ext;
                                string newTimeImg = "images/" + DateTime.Now.ToString("yyyyMMddHHmmss") + ext;

                                lock (QueueHelper.lockObj)
                                {
                                    var d = new Dictionary<string, string>();
                                    d.Add(newTagImg, newTimeImg);
                                    QueueHelper.Q_DownImgResource.Enqueue(d);
                                }
                            }
                        }

                        if (m.IsLoop == 1)
                        {
                            // Append every match, separated by "$$$$".
                            // NOTE(review): cutContent already holds element [0] of the same
                            // CutStr call, so that text precedes the joined list. Confirm intended.
                            string[] loopMatches = CollectionHelper.Instance.CutStr(pageContent, regContent);
                            foreach (string s in loopMatches)
                            {
                                cutContent += s + "$$$$";
                            }
                            int n = cutContent.LastIndexOf("$$$$");
                            cutContent = cutContent.Remove(n, 4);
                        }

                        if (m.IsLinkUrl == 1)
                        {
                            // Each "$$$$"-separated entry is followed as a link. Only the result
                            // of the last entry remains in cutContent after the loop.
                            string[] linkEntries = cutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
                            foreach (string sUrl in linkEntries)
                            {
                                cutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, modelTask.TestViewUrl);
                                cutContent = CollectionHelper.Instance.GetHttpPage(cutContent, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                                regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                                regContent = regContent.Replace("\\(\\*)", ".+?");
                                regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                                cutContent = CollectionHelper.Instance.CutStr(cutContent, regContent)[0];
                            }
                        }

                        // The label spans several pages: fetch each one and append its content.
                        if (m.IsPager == 1)
                        {
                            regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValuePagerRegex);
                            regContent = regContent.Replace("\\(\\*)", ".+?");
                            regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                            string[] pagerUrls = CollectionHelper.Instance.CutStr(pageContent, regContent);

                            foreach (string pageUrl in pagerUrls)
                            {
                                string url1             = CollectionHelper.Instance.DefiniteUrl(pageUrl, url);
                                string pageContentPager = CollectionHelper.Instance.GetHttpPage(url1, 100000);

                                // Test the page that was just downloaded, not the main page.
                                if (pageContentPager.Equals("$UrlIsFalse$") || pageContentPager.Equals("$GetFalse$"))
                                {
                                    cutContent += "=====分页内容=======================================================\r\n";
                                    cutContent += "远程链接内容失败!";
                                }
                                else
                                {
                                    // Cut the label out of the pager page with the label's own regex.
                                    string regContent1 = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                                    regContent1 = CommonHelper.ReplaceSystemRegexTag(regContent1);
                                    string cutContent1 = CollectionHelper.Instance.CutStr(pageContentPager, regContent1)[0];

                                    cutContent += "=====分页内容=======================================================\r\n";
                                    cutContent += cutContent1;
                                }
                            }
                        }

                        // Strip the configured HTML elements.
                        if (!string.IsNullOrEmpty(m.LabelHtmlRemove))
                        {
                            string[] htmlRules = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                            foreach (string rule in htmlRules)
                            {
                                if (rule == "all")
                                {
                                    // Removing all HTML makes the remaining rules pointless.
                                    cutContent = CollectionHelper.Instance.NoHtml(cutContent);
                                    break;
                                }
                                else if (rule == "table")
                                {
                                    cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "table", 2);
                                }
                                else if (rule == "font<span>")
                                {
                                    cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "font", 3);
                                    cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "span", 3);
                                }
                                else if (rule == "a")
                                {
                                    cutContent = CollectionHelper.Instance.ScriptHtml(cutContent, "a", 3);
                                }
                            }
                        }

                        // Remove the configured substrings.
                        if (!string.IsNullOrEmpty(m.LabelRemove))
                        {
                            foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                            {
                                cutContent = cutContent.Replace(str, "");
                            }
                        }

                        // Apply the configured "old||new" replacements.
                        if (!string.IsNullOrEmpty(m.LabelReplace))
                        {
                            foreach (string rule in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                            {
                                string[] pair = rule.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                                if (pair.Length == 0)
                                {
                                    continue;
                                }
                                // A rule without a replacement part ("old||") replaces with nothing
                                // instead of indexing past the end of the array.
                                string replacement = pair.Length > 1 ? pair[1] : string.Empty;
                                cutContent = cutContent.Replace(pair[0], replacement);
                            }
                        }

                        columns.Append("," + m.LabelName.Replace("'", "''"));
                        values.Append(",'" + cutContent.Replace("'", "''") + "'");

                        // Download the resources referenced by the content when enabled.
                        if (m.IsDownResource == 1)
                        {
                            string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + modelTask.TaskName + "\\Images\\";
                            // NOTE(review): the content returned by SaveUrlPics is not stored,
                            // because the label value was already appended above. Confirm intended.
                            ImageDownHelper.SaveUrlPics(cutContent, downImgPath);
                        }
                    }

                    string LocalSQLiteName = "Data\\Collection\\" + modelTask.TaskName + "\\SpiderResult.db";
                    string sql             = " Select Count(1) From Content Where HrefSource='" + escapedUrl + "' ";
                    object o = SQLiteHelper.ExecuteScalar(LocalSQLiteName, sql);
                    if (Convert.ToInt32("0" + o) == 0)
                    {
                        string insertSql = "insert into Content(" + columns + ") values (" + values + ")";
                        SQLiteHelper.Execute(LocalSQLiteName, insertSql);
                    }

                    // Keep only the part of the page title before the first '_' or '-'.
                    title            = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
                    gatherEv.Message = mlink.Url + "=" + title;
                    GatherWorkDelegate(this, gatherEv);
                }
                else
                {
                    gatherEv.Message = "没有采集到任何地址!不需要采集!";
                    GatherWorkDelegate(this, gatherEv);
                }

                // Pause 2, 4 or 6 seconds before the caller continues.
                var r        = new Random();
                var stepNext = r.Next(1, 4);
                Thread.Sleep(stepNext * 2000);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Downloads one list page, extracts the links it contains and queues every link
        /// that passes the task's URL filters and has not been collected before.
        /// </summary>
        /// <param name="urlList">Address of the list page to download and scan.</param>
        /// <remarks>
        /// Two modes exist. When <c>modelTask.LinkSpliceUrlStr</c> is null or blank, regex
        /// group 1 is the link address and group 2 its title. Otherwise each match fills the
        /// "[参数N]" placeholders of the splice template with its named groups; the links
        /// queued in that mode carry an empty title.
        /// </remarks>
        private void GetAllLinkUrl(string urlList)
        {
            string pageContent = CollectionHelper.Instance.GetHttpPage(urlList, 100000, Encoding.GetEncoding(modelTask.PageEncode));

            if (pageContent == "$StartFalse$" || pageContent == "$EndFalse$")
            {
                MessageOut(urlList + "采集地址失败!结果:" + pageContent);
                return;
            }

            // Optionally narrow the page down to the configured start/end markers.
            if (modelTask.LinkUrlCutAreaStart != null && modelTask.LinkUrlCutAreaEnd != null)
            {
                pageContent = HtmlHelper.Instance.ParseCollectionStrings(pageContent);
                pageContent = CollectionHelper.Instance.GetBody(pageContent,
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaStart),
                                                                HtmlHelper.Instance.ParseCollectionStrings(modelTask.LinkUrlCutAreaEnd),
                                                                false,
                                                                false);
                pageContent = HtmlHelper.Instance.UnParseCollectionStrings(pageContent);
            }

            // A missing splice template is treated like an empty one instead of throwing.
            string spliceTemplate = modelTask.LinkSpliceUrlStr;
            bool   useSplice      = spliceTemplate != null && spliceTemplate.Trim().Length > 0;

            string regexHref  = cRegexHelper.RegexATag;
            int    paramCount = 0;

            if (modelTask.IsHandGetUrl == 1)
            {
                regexHref = modelTask.HandCollectionUrlRegex;
                if (!useSplice)
                {
                    // "(*)" is a wildcard and "[参数]" is a positional capture group.
                    regexHref = HtmlHelper.Instance.ParseCollectionStrings(regexHref);
                    regexHref = regexHref.Replace("\\(\\*)", ".+?");
                    regexHref = regexHref.Replace("\\[参数]", "([\\S\\s].*?)");
                }
                else
                {
                    // Escape literal '[' but keep the "[参数]" placeholders intact, then turn
                    // each placeholder into a named group 参数1, 参数2, ... in order of appearance.
                    regexHref = regexHref.Replace("[", "\\[");
                    regexHref = regexHref.Replace("\\[参数]", "[参数]");
                    regexHref = regexHref.Replace("(*)", ".+?");

                    const string placeholder = "[参数]";
                    int at = regexHref.IndexOf(placeholder, StringComparison.Ordinal);
                    while (at >= 0)
                    {
                        paramCount++;
                        regexHref = regexHref.Remove(at, placeholder.Length);
                        regexHref = regexHref.Insert(at, "(?<参数" + paramCount + ">.+?)");
                        at = regexHref.IndexOf(placeholder, StringComparison.Ordinal);
                    }
                }
            }
            Regex reg = new Regex(regexHref, RegexOptions.IgnoreCase | RegexOptions.Compiled);

            if (!useSplice)
            {
                for (Match mch = reg.Match(pageContent); mch.Success; mch = mch.NextMatch())
                {
                    Thread.Sleep(1);
                    string title = mch.Groups[2].Value;
                    if (string.IsNullOrEmpty(title))
                    {
                        continue;
                    }
                    string url = CollectionHelper.Instance.FormatUrl(urlList, mch.Groups[1].Value);
                    url = url.Replace("\\", "");
                    if (IsLinkUrlAllowed(url))
                    {
                        QueueLinkUrlIfNew(url, title);
                    }
                }
            }
            else
            {
                // NOTE(review): when IsHandGetUrl is not 1, paramCount stays 0 and the
                // template is used unchanged for every match. Confirm that is intended.
                foreach (Match match in reg.Matches(pageContent))
                {
                    Thread.Sleep(1);

                    string spliced = spliceTemplate;
                    for (int x = 1; x <= paramCount; x++)
                    {
                        spliced = spliced.Replace("[参数" + x + "]", match.Groups["参数" + x].Value);
                    }
                    string url = CollectionHelper.Instance.FormatUrl(urlList, spliced);
                    url = url.Replace("\\", "");
                    if (IsLinkUrlAllowed(url))
                    {
                        // No title is extracted in splice mode.
                        QueueLinkUrlIfNew(url, string.Empty);
                    }
                }
            }
        }

        /// <summary>
        /// Checks a URL against the task's must-include text and its "||"-separated
        /// list of forbidden fragments.
        /// </summary>
        /// <param name="url">The formatted link address to test.</param>
        /// <returns><c>true</c> when the URL passes both filters.</returns>
        private bool IsLinkUrlAllowed(string url)
        {
            // URLs are machine identifiers: match them ordinally, not with the current
            // culture's linguistic rules.
            if (modelTask.LinkUrlMustIncludeStr != null
                && url.IndexOf(Convert.ToString(modelTask.LinkUrlMustIncludeStr), StringComparison.Ordinal) == -1)
            {
                return false;
            }
            if (modelTask.LinkUrlNoMustIncludeStr != null)
            {
                foreach (string fragment in modelTask.LinkUrlNoMustIncludeStr.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    if (url.IndexOf(fragment, StringComparison.Ordinal) > -1)
                    {
                        return false;
                    }
                }
            }
            return true;
        }

        /// <summary>
        /// Queues a link unless it is already queued; links already present in the stored
        /// spider results are reported through <c>MessageOut</c> but not queued.
        /// </summary>
        /// <param name="url">The formatted link address.</param>
        /// <param name="title">The link title shown in the message and stored with the link.</param>
        private void QueueLinkUrlIfNew(string url, string title)
        {
            foreach (var queued in _listLinkUrl.ToArray())
            {
                if (queued.Url == url)
                {
                    return;
                }
            }

            // Filter out links that already exist in the stored results.
            string msg = url + "==" + HtmlHelper.Instance.ParseTags(title);
            if (!DALContentHelper.ChkExistSpiderResult(modelTask.TaskName, url))
            {
                ModelLinkUrl m = new ModelLinkUrl();
                m.Url   = url;
                m.Title = title;
                _listLinkUrl.Enqueue(m);
            }
            else
            {
                msg += "采集地址存在!不需要采集!";
            }
            MessageOut(msg);
        }