/// <summary>
/// Extension point: override in a derived class to run additional work once
/// a single Content has been processed.
/// </summary>
/// <param name="content">The Content that was just processed.</param>
/// <param name="context">The ProcessContext that was used for that processing run.</param>
protected virtual void OnProcessCompleted(Content content, ProcessContext context)
{
    // Intentionally empty: the base implementation does nothing.
}
/// <summary>
/// Runs every registered content handler against one piece of content, in priority order.
/// </summary>
/// <remarks>
/// Handlers are gathered from the settings and from the content's source URL, null
/// entries are dropped, and the combined list is sorted with
/// <c>ContentHandlerPriorityCompare</c>. The content is disposed once all handlers and
/// <c>OnProcessCompleted</c> have run; afterwards the calling thread sleeps for the
/// number of milliseconds given by the settings' <c>SpeedMode</c> value.
/// </remarks>
/// <param name="content">Content instance to process. A null value is ignored.</param>
private void DoProcess(Content content)
{
    // This guard has to come first: everything below dereferences content.
    if (null == content)
    {
        return;
    }

    // Nothing to do when neither source provides a handler collection.
    if (null == this.settings.ContentHandlers && null == content.RawUrl.ContentHandlers)
    {
        return;
    }

    // Merge the handlers from both sources. Either collection may be null on its own,
    // so each one is checked separately before it is enumerated.
    List<IContentHandler> handlers = new List<IContentHandler>();
    if (null != this.settings.ContentHandlers)
    {
        foreach (IContentHandler h in this.settings.ContentHandlers)
        {
            if (null != h)
            {
                handlers.Add(h);
            }
        }
    }
    if (null != content.RawUrl.ContentHandlers)
    {
        foreach (IContentHandler h in content.RawUrl.ContentHandlers)
        {
            if (null != h)
            {
                handlers.Add(h);
            }
        }
    }
    handlers.Sort(new ContentHandlerPriorityCompare());

    ProcessContext context = new ProcessContext(this);
    using (content)
    {
        // Invoke the handlers in priority order.
        foreach (IContentHandler handler in handlers)
        {
            handler.Process(content, context);
        }

        // Post-processing hook for derived classes.
        this.OnProcessCompleted(content, context);
    }

    // Throttle: pause before the caller moves on to the next content.
    Thread.Sleep((int)this.settings.SpeedMode);
}
/// <summary>
/// Extension point: override in a derived class to run additional work before
/// a Content is placed on the queue.
/// </summary>
/// <param name="content">The Content that is about to be queued.</param>
protected virtual void BeforeEnqueueContent(Content content)
{
    // Intentionally empty: the base implementation does nothing.
}
/// <summary>
/// Extension point: override in a derived class to run additional work once
/// a Content has finished downloading.
/// </summary>
/// <param name="content">The Content that was downloaded.</param>
protected virtual void OnContentLoaded(Content content)
{
    // Intentionally empty: the base implementation does nothing.
}
/// <summary>
/// When overridden in a derived class, finds the URL strings contained in the given content.
/// </summary>
/// <param name="content">Content to search.</param>
/// <returns>
/// A sequence of string pairs. Extract treats each pair's Key as the URL string and
/// copies its Value into the resulting Url's Text, and it tolerates a null return
/// value (treated as "no URLs found").
/// </returns>
protected abstract IEnumerable<KeyValuePair<string, string>> FindUrls(Content content);
/// <summary>
/// Adds a Content to the content queue by hand.
/// </summary>
/// <param name="content">Content to enqueue. A null value is silently ignored.</param>
public void ManualQueue(Content content)
{
    if (null != content)
    {
        this.contentQueue.Add(content);
    }
}
/// <summary>
/// Extracts an e-mail address from a text content and stores it on the matching user row.
/// </summary>
/// <remarks>
/// Non-text content is ignored. The content is first saved to a file named after the
/// source URL's "id" parameter; then the first match of <c>mReg</c> in the text, if any,
/// is written to <c>users.email</c> for the row whose <c>uid</c> equals that "id" parameter.
/// </remarks>
/// <param name="content">Content to inspect; only TextContent instances are handled.</param>
/// <param name="context">Processing context (not used by this handler).</param>
public void Process(Content content, ProcessContext context)
{
    if (!(content is TextContent))
    {
        return;
    }

    // NOTE(review): hard-coded absolute dump directory - looks like a debugging aid; confirm it is still wanted.
    content.SaveToFile(@"E:\SOUEX\tmp\" + content.RawUrl.GetRawParam("id")+".txt");
    Console.WriteLine("提取Email From: {0}", content.RawUrl.GetUrl());

    // Regex.Match never returns null, so Success is the only check that is needed.
    Match m = mReg.Match(((TextContent)content).Content);
    if (!m.Success)
    {
        return;
    }

    // Both values originate from crawled (untrusted) data, so they are escaped before
    // being embedded in the statement text.
    // NOTE(review): prefer a parameterized command if the db helper offers one.
    string sql = String.Format(
        "UPDATE users SET email='{0}' WHERE uid='{1}'",
        EscapeSqlLiteral(m.Value),
        EscapeSqlLiteral(content.RawUrl.GetRawParam("id")));
    Console.WriteLine(sql);
    db.ExecuteNonQuery(sql);
}

/// <summary>
/// Escapes a value so it can be embedded inside a single-quoted SQL string literal.
/// </summary>
/// <param name="value">Raw value; null is treated as an empty string.</param>
/// <returns>The value with backslashes and single quotes doubled.</returns>
private static string EscapeSqlLiteral(string value)
{
    if (String.IsNullOrEmpty(value))
    {
        return String.Empty;
    }

    // Backslashes are doubled first so the quotes added afterwards are not re-escaped.
    // NOTE(review): backslash doubling assumes a MySQL-style backend - confirm.
    return value.Replace("\\", "\\\\").Replace("'", "''");
}
/// <summary>
/// Extracts from the given content every URL that matches one of this extractor's rules.
/// </summary>
/// <remarks>
/// Candidate URL strings are supplied by <c>FindUrls</c>. For each candidate the first
/// matching rule decides whether an index URL or a final URL is created; relative
/// strings are resolved against the content's own URL. Candidates that cannot be turned
/// into a URL, are invalid, or violate the same-host / same-domain restriction are
/// skipped. Accepted URLs inherit the HTTP method, append-parameters and content
/// handlers of the rule that matched them.
/// </remarks>
/// <param name="content">Content to extract URLs from.</param>
/// <returns>
/// The accepted URLs keyed by their CheckSum (first occurrence wins); an empty
/// dictionary when <c>FindUrls</c> returns null.
/// </returns>
public IDictionary<uint, Url> Extract(Content content)
{
    IEnumerable<KeyValuePair<string, string>> candidates = this.FindUrls(content);
    IDictionary<uint, Url> result = new Dictionary<uint, Url>();
    if (null == candidates)
    {
        return result;
    }

    Url owner = content.RawUrl;

    foreach (KeyValuePair<string, string> candidate in candidates)
    {
        string link = candidate.Key;

        // The first rule that matches the URL string wins.
        UrlRuler rule = null;
        foreach (UrlRuler r in this.rulers)
        {
            if (r.IsMatch(link))
            {
                rule = r;
                break;
            }
        }
        if (null == rule)
        {
            continue;
        }

        // Build the Url object according to the kind of URL the rule describes.
        Url created = null;
        try
        {
            switch (rule.UrlType)
            {
                case UrlTypes.Index:
                    created = Utils.IsAbsoluteUrlString(link)
                        ? Url.CreateIndexUrl(link)
                        : Url.CreateIndexUrl(content.RawUrl.Uri.AbsoluteUri, link);
                    break;
                case UrlTypes.Final:
                    created = Utils.IsAbsoluteUrlString(link)
                        ? Url.CreateFinalUrl(link, owner)
                        : Url.CreateFinalUrl(content.RawUrl.Uri.AbsoluteUri, link, owner);
                    break;
            }
        }
        catch (Exception e1)
        {
            Console.WriteLine("无法创建URL,错误:{0}, URL字符串:{1}", e1, link);
            continue;
        }

        // Drop URLs that could not be created or are invalid.
        if (null == created || !created.IsValid)
        {
            continue;
        }
        // Drop URLs that break the same-host restriction, when it is enabled.
        if (this.OnlySameHost && !Url.IsSameHost(content.RawUrl, created))
        {
            continue;
        }
        // Drop URLs that break the same-domain restriction, when it is enabled.
        if (this.OnlySameDomain && !Url.IsSameDomain(content.RawUrl, created))
        {
            continue;
        }

        created.Text = candidate.Value;

        // Copy the matched rule's attributes onto the new URL.
        created.HttpMethod = rule.HttpMethod;
        created.AppendParams = rule.AppendParmas;
        foreach (IContentHandler h in rule.ContentHandlers)
        {
            created.ContentHandlers.Add(h);
        }

        // Keep only the first URL seen for any given checksum.
        if (result.ContainsKey(created.CheckSum))
        {
            continue;
        }
        result.Add(created.CheckSum, created);
    }

    return result;
}
/// <summary>
/// Collects (ID, NAME) pairs from a text content and inserts the users that are not stored yet.
/// </summary>
/// <remarks>
/// Non-text content is ignored. Every match of <c>mReg</c> contributes its "ID" and
/// "NAME" groups; a pair is kept only when the id consists solely of decimal digits, the
/// name is non-empty, and the id has not been seen before in this content. For each kept
/// id that has no row in <c>users</c> yet, a row with an empty e-mail is inserted.
/// </remarks>
/// <param name="content">Content to inspect; only TextContent instances are handled.</param>
/// <param name="context">Processing context (not used by this handler).</param>
public void Process(Content content, ProcessContext context)
{
    if (!(content is TextContent))
    {
        return;
    }

    StringDictionary list = new StringDictionary();
    string uid, uname;
    MatchCollection matches = mReg.Matches(((TextContent)content).Content);
    foreach (Match m in matches)
    {
        if (!m.Success)
        {
            continue;
        }
        uid = m.Groups["ID"].Value;
        uname = m.Groups["NAME"].Value;

        // The id is placed unquoted into the SELECT below, and the ":d" format specifier
        // has no effect on a string argument, so anything other than plain digits is
        // rejected here instead of being passed through to the database.
        if (IsAllDigits(uid) && !list.ContainsKey(uid) && !String.IsNullOrEmpty(uname))
        {
            list.Add(uid, uname);
            Console.WriteLine("ID={0},Name='{1}'", uid, uname);
        }
    }
    if (list.Count == 0)
    {
        return;
    }

    foreach (string k in list.Keys)
    {
        // Skip users that already have a row.
        string sql = String.Format("select COUNT(`uid`) from users where uid={0:d}", k);
        int n = Convert.ToInt32(db.ExecuteScalar(sql));
        if (n > 0)
        {
            continue;
        }

        // The name comes from crawled (untrusted) text, so it is escaped before being
        // embedded in the statement.
        // NOTE(review): prefer a parameterized command if the db helper offers one.
        sql = String.Format("INSERT INTO users (uid, uname, email)VALUES('{0:d}', '{1}', '');", k, EscapeSqlLiteral(list[k]));
        db.ExecuteNonQuery(sql);
    }
}

/// <summary>
/// Tells whether a string is non-empty and made up only of the characters '0' to '9'.
/// </summary>
/// <param name="value">String to test.</param>
/// <returns>true when every character is an ASCII digit; false for null or empty input.</returns>
private static bool IsAllDigits(string value)
{
    if (String.IsNullOrEmpty(value))
    {
        return false;
    }
    foreach (char c in value)
    {
        if (c < '0' || c > '9')
        {
            return false;
        }
    }
    return true;
}

/// <summary>
/// Escapes a value so it can be embedded inside a single-quoted SQL string literal.
/// </summary>
/// <param name="value">Raw value; null is treated as an empty string.</param>
/// <returns>The value with backslashes and single quotes doubled.</returns>
private static string EscapeSqlLiteral(string value)
{
    if (String.IsNullOrEmpty(value))
    {
        return String.Empty;
    }

    // Backslashes are doubled first so the quotes added afterwards are not re-escaped.
    // Backslash doubling targets MySQL-style literals (the SELECT above uses backtick identifiers).
    return value.Replace("\\", "\\\\").Replace("'", "''");
}
/// <summary>
/// When overridden in a derived class, processes the given content.
/// </summary>
/// <param name="content">Content instance to process.</param>
/// <param name="context">ProcessContext carrying the context information for this processing run.</param>
public abstract void Process(Content content, ProcessContext context);
/// <summary>
/// Finds URL strings in a text content using the anchor and image regular expressions.
/// </summary>
/// <remarks>
/// Anchor matches contribute their "URL" group as the key and their "TEXT" group as the
/// value. Image matches contribute their "SRC" group as the key and, when
/// <c>UrlExtractor.ImageAltRegex</c> matches the image markup, its "ALT" group as the
/// value (otherwise an empty string). Matches with an empty URL are skipped. A regular
/// expression that is null is simply not applied.
/// </remarks>
/// <param name="content">Content to scan; must be exactly of type TextContent.</param>
/// <returns>
/// The (URL, text) pairs found, in match order; null when <paramref name="content"/> is
/// null or its runtime type is not exactly TextContent.
/// </returns>
protected override IEnumerable<KeyValuePair<string, string>> FindUrls(Content content)
{
    if (null == content || content.GetType() != typeof(TextContent))
    {
        return null;
    }

    TextContent textContent = (TextContent)content;
    List<KeyValuePair<string, string>> found = new List<KeyValuePair<string, string>>();

    // Hyperlinks: URL plus the link text.
    if (null != this.regexA)
    {
        foreach (Match anchor in this.regexA.Matches(textContent.Content))
        {
            string href = anchor.Groups["URL"].Value;
            string caption = anchor.Groups["TEXT"].Value;
            if (!String.IsNullOrEmpty(href))
            {
                found.Add(new KeyValuePair<string, string>(href, caption));
            }
        }
    }

    // Images: source URL plus the alt text, when the markup carries one.
    if (null != this.regexImg)
    {
        foreach (Match image in this.regexImg.Matches(textContent.Content))
        {
            string src = image.Groups["SRC"].Value;
            Match altMatch = UrlExtractor.ImageAltRegex.Match(image.Value);
            string alt = altMatch.Success ? altMatch.Groups["ALT"].Value : "";
            if (!String.IsNullOrEmpty(src))
            {
                found.Add(new KeyValuePair<string, string>(src, alt));
            }
        }
    }

    return found;
}
/// <summary>
/// Initializes the instance for the given content.
/// </summary>
/// <param name="content">Value to assign to <c>Content</c>.</param>
protected ContentBase(Content content)
    : this()
{
    // The parameterless constructor has already run; replace its null with the real value.
    this.Content = content;
}
/// <summary>
/// Initializes the instance with no content; only reachable from within this class.
/// </summary>
private ContentBase()
{
    this.Content = null;
}