Пример #1
0
        /// <summary>
        /// Applies one extraction rule and returns the value for a single cell.
        /// </summary>
        /// <param name="r">The extraction rule to apply.</param>
        /// <returns>
        /// The current time, the rule's constant value, the response URL or a response header
        /// when the rule selects one of those; otherwise the text cut from the page HTML after
        /// optional regex matching, replacements, HTML-mark filtering and whitespace removal.
        /// </returns>
        private string ExtractionColumn(ExtractionRule r)
        {
            // Use the time of extraction as the result (formatted by DateTime.ToString()
            // with its default, culture-dependent format).
            if (r.TimeAsResult)
            {
                return DateTime.Now.ToString();
            }

            // Use the constant value as the result. When the rule is also Static the
            // constant is treated as a regex pattern further down instead.
            if (r.ConstantAsResult && !r.Static)
            {
                return r.ConstantValue;
            }

            // Use the URL of the current response as the result.
            if (r.UrlAsResult)
            {
                return _response.ResponseUri.AbsoluteUri;
            }

            // Use a response header as the result.
            // NOTE(review): this may be null when the header is absent, depending on the
            // type of _response.Headers - confirm that callers cope with that.
            if (r.ResponseHeaderAsResult)
            {
                return _response.Headers[r.ResponseHeaderName];
            }

            // POST-parameters-as-result and link-text-as-result are not handled here
            // (they were only stubbed out in comments).

            // Cut the content out of the page HTML.
            // Presumably the text between PreviousFlag and FollowingFlag - verify against
            // Smart.Utility.StringHelper.SubString.
            string result = Smart.Utility.StringHelper.SubString(_htmlText, r.PreviousFlag, r.FollowingFlag);

            // For a Static rule the constant value is a regex pattern: the result is the
            // concatenation of every match found in the cut content.
            if (r.Static)
            {
                // StringBuilder avoids re-copying the accumulated text on every match.
                System.Text.StringBuilder matched = new System.Text.StringBuilder();
                foreach (Match m in Regex.Matches(result, r.ConstantValue))
                {
                    matched.Append(m.Value);
                }
                result = matched.ToString();
            }

            // Apply the rule's replacements to the result.
            result = ResultReplace(result, r.Replacements);

            // Filter HTML marks, passing the rule's reserved marks to the filter.
            result = FilterHtmlMark(result, r.ReservedHtmlMarks);

            // Strip every space, tab, line feed and carriage return, then trim any other
            // leading/trailing whitespace that is left.
            result = result.Replace(" ", "");
            result = result.Replace("\t", "");
            result = result.Replace("\n", "");
            result = result.Replace("\r", "");
            result = result.Trim();

            return result;
        }
Пример #2
0
        /// <summary>
        /// Evaluates a single extraction rule and produces the text for one cell.
        /// </summary>
        /// <param name="r">The extraction rule to evaluate.</param>
        /// <returns>
        /// Depending on the rule: the current time, the constant value, the response URL,
        /// a response header, or the content cut from the page HTML after optional regex
        /// matching, replacements, HTML-mark filtering and whitespace removal.
        /// </returns>
        private string ExtractionColumn(ExtractionRule r) {
            // The extraction time is the result (DateTime.ToString() default format,
            // which depends on the current culture).
            if (r.TimeAsResult) {
                return DateTime.Now.ToString();
            }

            // The constant value is the result, unless the rule is Static - in that case
            // the constant is used as a regex pattern below.
            if (r.ConstantAsResult && !r.Static) {
                return r.ConstantValue;
            }

            // The URL of the current response is the result.
            if (r.UrlAsResult) {
                return _response.ResponseUri.AbsoluteUri;
            }

            // A response header is the result.
            // NOTE(review): possibly null when the header is missing, depending on the
            // type of _response.Headers - confirm callers handle it.
            if (r.ResponseHeaderAsResult) {
                return _response.Headers[r.ResponseHeaderName];
            }

            // POST parameters and link text as result are not supported here (they only
            // ever existed as commented-out stubs).

            // Cut the content from the page HTML.
            // Presumably the span between PreviousFlag and FollowingFlag - verify against
            // Smart.Utility.StringHelper.SubString.
            string result = Smart.Utility.StringHelper.SubString(_htmlText, r.PreviousFlag, r.FollowingFlag);

            // Static rule: the constant value is a regex pattern and the result becomes
            // all of its matches joined together in order.
            if (r.Static) {
                // Accumulate in a StringBuilder rather than concatenating strings, which
                // would copy the growing result once per match.
                System.Text.StringBuilder joined = new System.Text.StringBuilder();
                foreach (Match m in Regex.Matches(result, r.ConstantValue)) {
                    joined.Append(m.Value);
                }
                result = joined.ToString();
            }

            // Run the rule's replacements over the result.
            result = ResultReplace(result, r.Replacements);

            // Filter HTML marks, handing the rule's reserved marks to the filter.
            result = FilterHtmlMark(result, r.ReservedHtmlMarks);

            // Remove all spaces, tabs, line feeds and carriage returns; the final Trim
            // drops any other whitespace remaining at either end.
            result = result.Replace(" ", "");
            result = result.Replace("\t", "");
            result = result.Replace("\n", "");
            result = result.Replace("\r", "");
            result = result.Trim();

            return result;
        }