/// <summary>
/// Collects one queued content page: dequeues the next link from <c>ListLinkUrl</c>, downloads it,
/// extracts every label configured in <c>ModelTask.ListTaskLabel</c>, inserts the result as one row
/// of the task's <c>Content</c> table and reports the page through <c>GatherWorkDelegate</c>.
/// Does nothing unless <c>ModelTask.IsSpiderContent</c> is 1.
/// </summary>
/// <param name="index">Not read by this method.</param>
/// <param name="threadindex">Not read by this method.</param>
private void Run(int index, int threadindex)
        {
            if (ModelTask.IsSpiderContent != 1)
            {
                return;
            }

            if (ListLinkUrl.Count <= 0)
            {
                // User-facing text: "No address was collected, nothing to gather."
                MessageShow("没有采集到任何地址!不需要采集!");
                return;
            }

            // Report progress before the page is processed.
            ProressNum++;
            if (OutPutTaskProgressBarDelegate != null)
            {
                MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
                ea.ProgressNum = ProressNum;
                ea.RecordNum   = TaskCount;
                ea.TaskIndex   = TaskIndex;
                OutPutTaskProgressBarDelegate(this, ea);
            }

            ModelLinkUrl  mlink       = ListLinkUrl.Dequeue();
            string        url         = mlink.Url;
            string        pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(ModelTask.PageEncode));
            string        title       = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

            // Both lists are built with a LEADING comma so that a task without labels still
            // yields valid SQL (the previous trailing-comma trimming threw on an empty builder).
            StringBuilder columnNames  = new StringBuilder();
            StringBuilder columnValues = new StringBuilder();

            foreach (ModelTaskLabel m in ModelTask.ListTaskLabel)
            {
                string regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
                string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

                #region Rewrite image references in the content to remote URLs
                string[] TagImgList = CollectionHelper.Instance.GetImgTag(CutContent);
                foreach (string tagimg in TagImgList)
                {
                    if (string.IsNullOrEmpty(tagimg))
                    {
                        break;
                    }
                    string newTagImg = CollectionHelper.Instance.FormatUrl(ModelTask.TestViewUrl, tagimg);
                    CutContent = CutContent.Replace(tagimg, newTagImg);
                }
                #endregion

                #region Looping label: join all matches with "$$$$"
                if (m.IsLoop == 1)
                {
                    string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
                    // CutContent already holds the (image-rewritten) first match, so only the
                    // remaining matches are appended; appending all of them duplicated the first.
                    if (LabelString.Length > 1)
                    {
                        CutContent = CutContent + "$$$$" + string.Join("$$$$", LabelString, 1, LabelString.Length - 1);
                    }
                }
                #endregion

                #region Label value is a link: follow it and cut the value from the linked page
                if (m.IsLinkUrl == 1)
                {
                    string[] CutContentArr = CutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
                    // NOTE(review): every iteration overwrites CutContent, so only the value of the
                    // last link survives — confirm whether the results were meant to be accumulated.
                    foreach (string sUrl in CutContentArr)
                    {
                        CutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, ModelTask.TestViewUrl); // absolute address
                        CutContent = CollectionHelper.Instance.GetHttpPage(CutContent, 1000, Encoding.GetEncoding(ModelTask.PageEncode));
                        regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                        regContent = regContent.Replace("\\(\\*)", ".+?");
                        regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                        CutContent = CollectionHelper.Instance.CutStr(CutContent, regContent)[0];
                    }
                }
                #endregion

                #region Strip HTML
                if (!string.IsNullOrEmpty(m.LabelHtmlRemove))
                {
                    string[] arr = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string str in arr)
                    {
                        if (str == "all")
                        {
                            // "all" removes every tag, so the remaining rules are irrelevant.
                            CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                            break;
                        }
                        else if (str == "table")
                        {
                            CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
                        }
                        else if (str == "font<span>")
                        {
                            CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                            CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
                        }
                        else if (str == "a")
                        {
                            CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
                        }
                    }
                }
                #endregion

                #region Remove configured strings
                if (!string.IsNullOrEmpty(m.LabelRemove))
                {
                    foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                    {
                        CutContent = CutContent.Replace(str, "");
                    }
                }
                #endregion

                #region Replace configured strings ("search||replacement" rules separated by "$$$$")
                if (!string.IsNullOrEmpty(m.LabelReplace))
                {
                    foreach (string str in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                    {
                        string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                        if (ListStr.Length == 0)
                        {
                            continue;
                        }
                        // RemoveEmptyEntries drops an empty replacement ("search||"); treat that
                        // rule as "replace with nothing" instead of indexing past the array.
                        string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
                        CutContent = CutContent.Replace(ListStr[0], replacement);
                    }
                }
                #endregion

                columnNames.Append(',').Append(m.LabelName.Replace("'", "''"));
                columnValues.Append(",'").Append(CutContent.Replace("'", "''")).Append('\'');

                // Resource download, only when the label has the switch turned on.
                if (m.IsDownResource == 1)
                {
                    string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + ModelTask.TaskName + "\\Images\\";
                    // NOTE(review): the value returned by SaveUrlPics was never stored in the
                    // database (the column value is appended above); the call is kept as-is for
                    // whatever it does with the content — confirm the intended order.
                    ImageDownHelper.SaveUrlPics(CutContent, downImgPath);
                }
            }

            StringBuilder strSql = new StringBuilder();
            strSql.Append("insert into Content(HrefSource");
            strSql.Append(columnNames.ToString());
            strSql.Append(")");
            // The URL is escaped like every other value; an apostrophe in it used to break the statement.
            strSql.Append(" values ('" + url.Replace("'", "''") + "'");
            strSql.Append(columnValues.ToString());
            strSql.Append(")");

            string LocalSQLiteName = "Data\\Collection\\" + ModelTask.TaskName + "\\SpiderResult.db";
            SQLiteHelper.Execute(LocalSQLiteName, strSql.ToString());

            // Keep only the part of the title before the first '_' or '-', with slashes blanked out.
            title            = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
            gatherEv.Message = mlink.Url + "=" + title;
            GatherWorkDelegate(this, gatherEv);
        }
        /// <summary>
        /// Extracts the content of one label from a downloaded page and post-processes it:
        /// image references are rewritten (to local download paths or to remote URLs), looping
        /// labels are joined with "$$$$", configured HTML tags and strings are removed or replaced,
        /// and an optional Python plugin is applied last.
        /// </summary>
        /// <param name="taskName">Task name; used to build the local image directory under Data\Collection.</param>
        /// <param name="collectionContentStepTime">Passed through unchanged to <c>QueueImgHelper.AddImg</c>.</param>
        /// <param name="spiderViewUrl">Base URL used when the label has no <c>TestViewUrl</c> of its own.</param>
        /// <param name="itemTaskLebel">Label definition (cut regex, filters, replace rules, plugin).</param>
        /// <param name="pageContent">Page text the label is cut from.</param>
        /// <param name="isTest">When true, images are renamed in the content but not queued for download.</param>
        /// <returns>The processed label content.</returns>
        private string GetLabelContent(string taskName, int collectionContentStepTime, string spiderViewUrl, ModelTaskLabel itemTaskLebel, string pageContent, bool isTest = false)
        {
            var remoteViewUrl = itemTaskLebel.TestViewUrl;
            if (string.IsNullOrEmpty(remoteViewUrl))
            {
                remoteViewUrl = spiderViewUrl;
            }

            string regContent = HtmlHelper.Instance.ParseCollectionStrings(itemTaskLebel.LabelNameCutRegex);
            regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
            string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

            #region Download resources
            var imgTag = ImageDownHelper.GetImgTag(CutContent);
            if (itemTaskLebel.IsDownResource == 1)
            {
                // An empty/missing extension list means "download every image".
                bool     filterByExt = !string.IsNullOrEmpty(itemTaskLebel.DownResourceExts);
                string[] imgExtArr   = filterByExt
                    ? itemTaskLebel.DownResourceExts.Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries)
                    : new string[0];
                var      downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + taskName + "\\Images\\";
                int      ii          = 1;
                foreach (var img in imgTag)
                {
                    var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img);
                    if (filterByExt)
                    {
                        // A URL without any dot has no extension and therefore never matches.
                        int    dotIndex = remoteImg.LastIndexOf('.');
                        string imgExt   = dotIndex >= 0 ? remoteImg.Substring(dotIndex) : string.Empty;
                        // Case-insensitive membership test; duplicates in the list are harmless.
                        if (!imgExtArr.Any(x => string.Equals(x, imgExt, StringComparison.OrdinalIgnoreCase)))
                        {
                            continue;
                        }
                    }
                    var newImg = DateTime.Now.ToString("yyyyMMddHHmmssffffff") + "_" + ii + ".jpg";
                    CutContent = CutContent.Replace(img, downImgPath + newImg);
                    if (!isTest)
                    {
                        QueueImgHelper.AddImg(Model.ID, downImgPath + newImg, remoteImg, collectionContentStepTime);
                    }
                    ii++;
                }
            }
            else
            {
                foreach (var img in imgTag)
                {
                    var remoteImg = CollectionHelper.Instance.FormatUrl(remoteViewUrl, img);
                    CutContent = CutContent.Replace(img, remoteImg);
                }
            }
            #endregion

            #region Looping label: join all matches with "$$$$"
            if (itemTaskLebel.IsLoop == 1)
            {
                string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
                // CutContent already holds the (image-rewritten) first match, so only the
                // remaining matches are appended; appending all of them duplicated the first.
                if (LabelString.Length > 1)
                {
                    CutContent = CutContent + "$$$$" + string.Join("$$$$", LabelString, 1, LabelString.Length - 1);
                }
            }
            #endregion

            #region Strip HTML
            if (!string.IsNullOrEmpty(itemTaskLebel.LabelHtmlRemove))
            {
                string[] arr = itemTaskLebel.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string str in arr)
                {
                    if (str == "all")
                    {
                        // "all" removes every tag, so the remaining rules are irrelevant.
                        CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                        break;
                    }
                    else if (str == "table")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
                    }
                    else if (str == "font<span>")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
                    }
                    else if (str == "a")
                    {
                        CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
                    }
                }
            }
            #endregion

            #region Remove configured strings ("text||flag" rules separated by "$$$$")
            if (!string.IsNullOrEmpty(itemTaskLebel.LabelRemove))
            {
                foreach (string str in itemTaskLebel.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                    if (ListStr.Length == 0)
                    {
                        continue;
                    }
                    // Flag "1" routes the rule through CollectionHelper.RemoveHtml; a rule without
                    // a flag is handled as a plain text removal instead of indexing past the array.
                    if (ListStr.Length > 1 && ListStr[1] == "1")
                    {
                        CutContent = CollectionHelper.RemoveHtml(CutContent, ListStr[0]);
                    }
                    else
                    {
                        CutContent = CutContent.Replace(ListStr[0], "");
                    }
                }
            }
            #endregion

            #region Replace configured strings ("search||replacement" rules separated by "$$$$")
            if (!string.IsNullOrEmpty(itemTaskLebel.LabelReplace))
            {
                foreach (string str in itemTaskLebel.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                {
                    string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                    if (ListStr.Length == 0)
                    {
                        continue;
                    }
                    // RemoveEmptyEntries drops an empty replacement ("search||"); treat that
                    // rule as "replace with nothing" instead of indexing past the array.
                    string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
                    CutContent = CutContent.Replace(ListStr[0], replacement);
                }
            }
            #endregion

            #region Run the label plugin
            string SpiderLabelPlugin = itemTaskLebel.SpiderLabelPlugin;
            // "不使用插件" is the stored value meaning "do not use a plugin".
            if (SpiderLabelPlugin != "不使用插件" && !string.IsNullOrEmpty(SpiderLabelPlugin))
            {
                CutContent = PythonExtHelper.RunPython(PluginUtility.SpiderContentPluginPath + SpiderLabelPlugin, new object[] { remoteViewUrl, CutContent });
            }
            #endregion

            return CutContent;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Collects one queued content page: dequeues the next link from <c>_listLinkUrl</c>, downloads it,
        /// extracts every label configured in <c>modelTask.ListTaskLabel</c> (including linked and paged
        /// values), inserts the result into the task's <c>Content</c> table unless a row with the same
        /// <c>HrefSource</c> already exists, and reports the page through <c>GatherWorkDelegate</c>.
        /// Afterwards the calling thread sleeps for 2, 4 or 6 seconds.
        /// Does nothing unless <c>modelTask.IsSpiderContent</c> is 1.
        /// </summary>
        /// <param name="index">Not read by this method.</param>
        /// <param name="threadindex">Not read by this method.</param>
        private void Run_ViewUrl(int index, int threadindex)
        {
            if (modelTask.IsSpiderContent == 1)
            {
                if (_listLinkUrl.Count > 0)
                {
                    // Report progress before the page is processed.
                    ProressNum++;
                    if (OutPutTaskProgressBarDelegate != null)
                    {
                        MainEvents.OutPutTaskProgressBarEventArgs ea = new MainEvents.OutPutTaskProgressBarEventArgs();
                        ea.ProgressNum = ProressNum;
                        ea.RecordNum   = TaskCount;
                        ea.TaskIndex   = TaskIndex;
                        OutPutTaskProgressBarDelegate(this, ea);
                    }

                    ModelLinkUrl  mlink       = _listLinkUrl.Dequeue();
                    string        url         = mlink.Url;
                    string        pageContent = CollectionHelper.Instance.GetHttpPage(url, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                    string        title       = CollectionHelper.Instance.CutStr(pageContent, "<title>([\\S\\s]*?)</title>")[0];

                    // Both lists are built with a LEADING comma so that a task without labels still
                    // yields valid SQL (the previous trailing-comma trimming threw on an empty builder).
                    StringBuilder columnNames  = new StringBuilder();
                    StringBuilder columnValues = new StringBuilder();

                    // Running number that keeps queued image names unique within this page.
                    int downImgIndex = 0;

                    foreach (ModelTaskLabel m in modelTask.ListTaskLabel)
                    {
                        string regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                        regContent = CommonHelper.ReplaceSystemRegexTag(regContent);
                        string CutContent = CollectionHelper.Instance.CutStr(pageContent, regContent)[0];

                        #region Rewrite image references in the content to remote URLs
                        string[] TagImgList = CollectionHelper.Instance.GetImgTag(CutContent);
                        foreach (string tagimg in TagImgList)
                        {
                            if (string.IsNullOrEmpty(tagimg))
                            {
                                break;
                            }
                            // Remote address of the image.
                            string newTagImg = CollectionHelper.Instance.FormatUrl(modelTask.TestViewUrl, tagimg);
                            // Replace the reference inside the content.
                            CutContent = CutContent.Replace(tagimg, newTagImg);

                            #region Queue the remote image for download
                            if (m.IsDownResource == 1)
                            {
                                // Path.GetExtension only parses the string; wrapping a URL in FileInfo
                                // can throw because of the colon after the scheme.
                                string ext = Path.GetExtension(newTagImg);
                                ext = string.IsNullOrEmpty(ext) ? ".jpg" : ext;
                                // Sub-second timestamp plus a counter: a second-resolution name made
                                // every image of the same page collide on one file name.
                                downImgIndex++;
                                string newTimeImg = "images/" + DateTime.Now.ToString("yyyyMMddHHmmssffffff") + "_" + downImgIndex + ext;

                                lock (QueueHelper.lockObj)
                                {
                                    var d = new Dictionary<string, string>();
                                    d.Add(newTagImg, newTimeImg);
                                    QueueHelper.Q_DownImgResource.Enqueue(d);
                                }
                            }
                            #endregion
                        }
                        #endregion

                        #region Looping label: join all matches with "$$$$"
                        if (m.IsLoop == 1)
                        {
                            string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);
                            // CutContent already holds the (image-rewritten) first match, so only the
                            // remaining matches are appended; appending all of them duplicated the first.
                            if (LabelString.Length > 1)
                            {
                                CutContent = CutContent + "$$$$" + string.Join("$$$$", LabelString, 1, LabelString.Length - 1);
                            }
                        }
                        #endregion

                        #region Label value is a link: follow it and cut the value from the linked page
                        if (m.IsLinkUrl == 1)
                        {
                            string[] CutContentArr = CutContent.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries);
                            // NOTE(review): every iteration overwrites CutContent, so only the value of the
                            // last link survives — confirm whether the results were meant to be accumulated.
                            foreach (string sUrl in CutContentArr)
                            {
                                CutContent = CollectionHelper.Instance.DefiniteUrl(sUrl, modelTask.TestViewUrl); // absolute address
                                CutContent = CollectionHelper.Instance.GetHttpPage(CutContent, 1000, Encoding.GetEncoding(modelTask.PageEncode));
                                regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValueLinkUrlRegex);
                                regContent = regContent.Replace("\\(\\*)", ".+?");
                                regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                                CutContent = CollectionHelper.Instance.CutStr(CutContent, regContent)[0];
                            }
                        }
                        #endregion

                        #region Label is paged: append the same label cut from every pager link
                        if (m.IsPager == 1)
                        {
                            regContent = HtmlHelper.Instance.ParseCollectionStrings(m.LabelValuePagerRegex);
                            regContent = regContent.Replace("\\(\\*)", ".+?");
                            regContent = regContent.Replace("\\[参数]", "([\\S\\s].*?)");
                            string[] LabelString = CollectionHelper.Instance.CutStr(pageContent, regContent);

                            foreach (string pageUrl in LabelString)
                            {
                                string url1             = CollectionHelper.Instance.DefiniteUrl(pageUrl, url);
                                string pageContentPager = CollectionHelper.Instance.GetHttpPage(url1, 100000);
                                // The failure markers must be looked for in the pager page that was just
                                // downloaded; checking the main page meant a failed pager download was
                                // never detected.
                                if (pageContentPager.Equals("$UrlIsFalse$") || pageContentPager.Equals("$GetFalse$"))
                                {
                                    CutContent += "=====分页内容=======================================================\r\n";
                                    CutContent += "远程链接内容失败!";
                                }
                                else
                                {
                                    // Cut the label again, this time from the pager page.
                                    string regContent1 = HtmlHelper.Instance.ParseCollectionStrings(m.LabelNameCutRegex);
                                    regContent1 = CommonHelper.ReplaceSystemRegexTag(regContent1);
                                    string CutContent1 = CollectionHelper.Instance.CutStr(pageContentPager, regContent1)[0];

                                    CutContent += "=====分页内容=======================================================\r\n";
                                    CutContent += CutContent1;
                                }
                            }
                        }
                        #endregion

                        #region Strip HTML
                        if (!string.IsNullOrEmpty(m.LabelHtmlRemove))
                        {
                            string[] arr = m.LabelHtmlRemove.Split(new string[] { "||||" }, StringSplitOptions.RemoveEmptyEntries);
                            foreach (string str in arr)
                            {
                                if (str == "all")
                                {
                                    // "all" removes every tag, so the remaining rules are irrelevant.
                                    CutContent = CollectionHelper.Instance.NoHtml(CutContent);
                                    break;
                                }
                                else if (str == "table")
                                {
                                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "table", 2);
                                }
                                else if (str == "font<span>")
                                {
                                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "font", 3);
                                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "span", 3);
                                }
                                else if (str == "a")
                                {
                                    CutContent = CollectionHelper.Instance.ScriptHtml(CutContent, "a", 3);
                                }
                            }
                        }
                        #endregion

                        #region Remove configured strings
                        if (!string.IsNullOrEmpty(m.LabelRemove))
                        {
                            foreach (string str in m.LabelRemove.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                            {
                                CutContent = CutContent.Replace(str, "");
                            }
                        }
                        #endregion

                        #region Replace configured strings ("search||replacement" rules separated by "$$$$")
                        if (!string.IsNullOrEmpty(m.LabelReplace))
                        {
                            foreach (string str in m.LabelReplace.Split(new string[] { "$$$$" }, StringSplitOptions.RemoveEmptyEntries))
                            {
                                string[] ListStr = str.Split(new string[] { "||" }, StringSplitOptions.RemoveEmptyEntries);
                                if (ListStr.Length == 0)
                                {
                                    continue;
                                }
                                // RemoveEmptyEntries drops an empty replacement ("search||"); treat that
                                // rule as "replace with nothing" instead of indexing past the array.
                                string replacement = ListStr.Length > 1 ? ListStr[1] : string.Empty;
                                CutContent = CutContent.Replace(ListStr[0], replacement);
                            }
                        }
                        #endregion

                        columnNames.Append(',').Append(m.LabelName.Replace("'", "''"));
                        columnValues.Append(",'").Append(CutContent.Replace("'", "''")).Append('\'');

                        // Resource download, only when the label has the switch turned on.
                        if (m.IsDownResource == 1)
                        {
                            string downImgPath = AppDomain.CurrentDomain.BaseDirectory + "Data\\Collection\\" + modelTask.TaskName + "\\Images\\";
                            // NOTE(review): the value returned by SaveUrlPics was never stored in the
                            // database (the column value is appended above); the call is kept as-is for
                            // whatever it does with the content — confirm the intended order.
                            ImageDownHelper.SaveUrlPics(CutContent, downImgPath);
                        }
                    }

                    string LocalSQLiteName = "Data\\Collection\\" + modelTask.TaskName + "\\SpiderResult.db";
                    // The URL is escaped like every other value; an apostrophe in it used to break both statements.
                    string escapedUrl      = url.Replace("'", "''");
                    string sql             = " Select Count(1) From Content Where HrefSource='" + escapedUrl + "' ";
                    object o = SQLiteHelper.ExecuteScalar(LocalSQLiteName, sql);
                    // Prefixing "0" makes a null result convert to 0. Insert only when the page is not stored yet.
                    if (Convert.ToInt32("0" + o) == 0)
                    {
                        StringBuilder strSql = new StringBuilder();
                        strSql.Append("insert into Content(HrefSource");
                        strSql.Append(columnNames.ToString());
                        strSql.Append(")");
                        strSql.Append(" values ('" + escapedUrl + "'");
                        strSql.Append(columnValues.ToString());
                        strSql.Append(")");

                        SQLiteHelper.Execute(LocalSQLiteName, strSql.ToString());
                    }

                    // Keep only the part of the title before the first '_' or '-', with slashes blanked out.
                    title            = title.Replace('\\', ' ').Replace('/', ' ').Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
                    gatherEv.Message = mlink.Url + "=" + title;
                    GatherWorkDelegate(this, gatherEv);
                }
                else
                {
                    // User-facing text: "No address was collected, nothing to gather."
                    gatherEv.Message = "没有采集到任何地址!不需要采集!";
                    GatherWorkDelegate(this, gatherEv);
                }

                // Pause for a random 2, 4 or 6 seconds before returning (Next(1, 4) yields 1..3).
                var r        = new Random();
                var stepNext = r.Next(1, 4);
                Thread.Sleep(stepNext * 2000);
            }
        }