/// <summary>
/// Decides whether a crawl result should be kept.
/// </summary>
/// <param name="input">The page wrapper to evaluate; may be null.</param>
/// <returns>
/// <c>false</c> when <paramref name="input"/> is null or its title is null/empty;
/// <c>true</c> when no title keyword is configured; otherwise whether the title
/// matches the configured keyword, which is used as a regular-expression pattern.
/// </returns>
private bool IfHtmlPackerIsWanted(HtmlPacker input)
{
    // Nothing to evaluate without a wrapper and a non-empty title.
    if (input == null || string.IsNullOrEmpty(input.NowTitle))
    {
        return false;
    }

    // An empty keyword means no filtering: every titled page is accepted.
    if (string.IsNullOrEmpty(m_strTitlKeyWord))
    {
        return true;
    }

    // The keyword is handed to Regex as the pattern, not compared literally.
    return Regex.IsMatch(input.NowTitle, m_strTitlKeyWord);
}
/// <summary>
/// Performs a single crawl step: takes one (URL, depth) entry from the queue,
/// requests its <c>HtmlPacker</c>, stores the packer in <c>LstResult</c> when it
/// passes <c>IfHtmlPackerIsWanted</c>, and queues the packer's sub-URLs one
/// level deeper. Does nothing when the queue reports no unused entries.
/// </summary>
private void DoOneSpiderWork()
{
    // No pending entries: nothing to do on this pass.
    if (m_useQueue.UnUseSize <= 0)
    {
        return;
    }

    // Entry key is the URL, entry value is the depth it was queued at.
    KeyValuePair<string, int> currentEntry = m_useQueue.Pop();
    HtmlPacker packer = m_useRequestManger.GetHtmlPacker(currentEntry.Key, currentEntry.Value);

    // Depth assigned to any sub-URL discovered on this page.
    int childDepth = currentEntry.Value + 1;

    // Keep the page if it passes the title filter (a null packer is rejected there).
    if (IfHtmlPackerIsWanted(packer))
    {
        LstResult.Add(packer);
    }

    // Stop expanding when no packer was obtained or the depth limit would be exceeded.
    if (packer == null || childDepth > m_nMaxDeep)
    {
        return;
    }

    foreach (var subUrl in packer.SubUrls)
    {
        // The URL check runs first for every sub-URL; the queue-size cap is only
        // consulted for URLs that pass it, and the loop continues even when full.
        if (IfUrlCanUse(subUrl) && m_useQueue.UnUseSize <= m_nMaxCount)
        {
            m_useQueue.Push(subUrl, childDepth);
        }
    }
}