/// <summary>
 /// Override in a derived class to run additional work after a single Content has been processed.
 /// The default implementation does nothing.
 /// </summary>
 /// <param name="content">Content</param>
 /// <param name="context">ProcessContext</param>
 protected virtual void OnProcessCompleted(Content content, ProcessContext context)
 {
     // Intentionally empty; this is an extension point for derived classes.
 }
        /// <summary>
        /// Runs every applicable content handler against one piece of content.
        /// </summary>
        /// <remarks>
        /// Handlers are gathered from this.settings.ContentHandlers and from
        /// content.RawUrl.ContentHandlers (either collection may be null, and null
        /// entries are skipped), sorted with ContentHandlerPriorityCompare and invoked
        /// in that order. OnProcessCompleted is called afterwards, the content is
        /// disposed, and the current thread sleeps for (int)settings.SpeedMode milliseconds.
        /// When both handler collections are null the method returns immediately
        /// without disposing the content or sleeping.
        /// </remarks>
        /// <param name="content">Content instance to process; a null value is ignored.</param>
        private void DoProcess(Content content)
        {
            // This check has to come first: content.RawUrl is dereferenced below.
            if (null == content)
            {
                return;
            }

            bool hasSettingsHandlers = null != this.settings.ContentHandlers;
            bool hasUrlHandlers = null != content.RawUrl.ContentHandlers;

            if (!hasSettingsHandlers && !hasUrlHandlers)
            {
                return;
            }

            // Collect the handlers from both sources. Each source is guarded on its own
            // because only one of the two collections may be present.
            List<IContentHandler> handlers = new List<IContentHandler>();

            if (hasSettingsHandlers)
            {
                foreach (IContentHandler h in this.settings.ContentHandlers)
                {
                    if (null != h)
                    {
                        handlers.Add(h);
                    }
                }
            }

            if (hasUrlHandlers)
            {
                foreach (IContentHandler h in content.RawUrl.ContentHandlers)
                {
                    if (null != h)
                    {
                        handlers.Add(h);
                    }
                }
            }

            handlers.Sort(new ContentHandlerPriorityCompare());

            ProcessContext context = new ProcessContext(this);

            using (content)
            {
                // Invoke the handlers in sorted order.
                foreach (IContentHandler handler in handlers)
                {
                    handler.Process(content, context);
                }

                // Post-processing hook; runs while the content is still undisposed.
                this.OnProcessCompleted(content, context);
            }

            Thread.Sleep((int)this.settings.SpeedMode);
        }
 /// <summary>
 /// Override in a derived class to run additional work before a Content is queued.
 /// The default implementation does nothing.
 /// </summary>
 /// <param name="content">Content</param>
 protected virtual void BeforeEnqueueContent(Content content)
 {
     // Intentionally empty; this is an extension point for derived classes.
 }
 /// <summary>
 /// Override in a derived class to run additional work once a Content has finished downloading.
 /// The default implementation does nothing.
 /// </summary>
 /// <param name="content">Content</param>
 protected virtual void OnContentLoaded(Content content)
 {
     // Intentionally empty; this is an extension point for derived classes.
 }
 /// <summary>
 /// Override in a derived class to find URL strings in the given content.
 /// </summary>
 /// <param name="content">Content</param>
 /// <returns>
 /// A sequence of string pairs.
 /// NOTE(review): presumably the key is the URL string and the value is its display text - confirm against callers.
 /// </returns>
 protected abstract IEnumerable<KeyValuePair<string, string>> FindUrls(Content content);
 /// <summary>
 /// Adds a Content to the content queue by hand.
 /// </summary>
 /// <param name="content">Content to enqueue; a null value is silently ignored.</param>
 public void ManualQueue(Content content)
 {
     if (null != content)
     {
         this.contentQueue.Add(content);
     }
 }
Exemple #7
0
        /// <summary>
        /// Saves the content to a file, searches its text with mReg and, when a match
        /// is found, writes the matched value into the email column of the users table.
        /// </summary>
        /// <param name="content">Content to inspect; anything that is not a TextContent is ignored.</param>
        /// <param name="context">ProcessContext (not used by this handler).</param>
        public void Process(Content content, ProcessContext context)
        {
            TextContent text = content as TextContent;
            if (null == text)
            {
                return;
            }

            // The "id" URL parameter names the dump file and selects the row to update.
            string id = content.RawUrl.GetRawParam("id");

            content.SaveToFile(@"E:\SOUEX\tmp\" + id + ".txt");

            Console.WriteLine("提取Email From: {0}", content.RawUrl.GetUrl());
            Match m = mReg.Match(text.Content);

            if (null != m && m.Success)
            {
                // Both values come from scraped input, so they are escaped before being
                // embedded in the statement.
                // NOTE(review): assumes MySQL-style literals (backslash is an escape
                // character) - confirm, and switch to a parameterized query if the db
                // helper supports one.
                string safeEmail = m.Value.Replace("\\", "\\\\").Replace("'", "''");
                string safeId = (id ?? String.Empty).Replace("\\", "\\\\").Replace("'", "''");

                string sql = String.Format("UPDATE users SET email='{0}' WHERE uid='{1}'", safeEmail, safeId);
                Console.WriteLine(sql);
                db.ExecuteNonQuery(sql);
            }
        }
        /// <summary>
        /// Extracts URLs from the given content and returns the ones accepted by this
        /// extractor's rules, keyed by their checksum.
        /// </summary>
        /// <param name="content">Content to scan.</param>
        /// <returns>
        /// Dictionary mapping Url.CheckSum to the Url. It is empty when FindUrls returns
        /// null or when no candidate passes the rules and filters. When several URLs share
        /// a checksum only the first one is kept.
        /// </returns>
        public IDictionary<uint, Url> Extract(Content content)
        {
            IDictionary<uint, Url> urls = new Dictionary<uint, Url>();

            IEnumerable<KeyValuePair<string, string>> strUrls = this.FindUrls(content);
            if (null == strUrls)
            {
                return urls;
            }

            Url holder = content.RawUrl;

            foreach (KeyValuePair<string, string> candidate in strUrls)
            {
                string urlString = candidate.Key;

                // The first rule that matches the URL string wins.
                UrlRuler matchedRuler = null;
                foreach (UrlRuler r in this.rulers)
                {
                    if (r.IsMatch(urlString))
                    {
                        matchedRuler = r;
                        break;
                    }
                }

                if (null == matchedRuler)
                {
                    continue;
                }

                // Build the Url according to the rule's type. Relative strings are
                // resolved against the absolute URI of the content they were found in.
                Url url = null;
                try
                {
                    switch (matchedRuler.UrlType)
                    {
                        case UrlTypes.Index:
                            url = Utils.IsAbsoluteUrlString(urlString)
                                ? Url.CreateIndexUrl(urlString)
                                : Url.CreateIndexUrl(content.RawUrl.Uri.AbsoluteUri, urlString);
                            break;

                        case UrlTypes.Final:
                            url = Utils.IsAbsoluteUrlString(urlString)
                                ? Url.CreateFinalUrl(urlString, holder)
                                : Url.CreateFinalUrl(content.RawUrl.Uri.AbsoluteUri, urlString, holder);
                            break;
                    }
                }
                catch (Exception e1)
                {
                    Console.WriteLine("无法创建URL,错误:{0}, URL字符串:{1}", e1, urlString);
                    continue;
                }

                // Drop URLs that could not be built or are invalid.
                if (null == url || !url.IsValid)
                {
                    continue;
                }

                // Apply the optional same-host restriction.
                if (this.OnlySameHost && !Url.IsSameHost(content.RawUrl, url))
                {
                    continue;
                }

                // Apply the optional same-domain restriction.
                if (this.OnlySameDomain && !Url.IsSameDomain(content.RawUrl, url))
                {
                    continue;
                }

                url.Text = candidate.Value;

                // Copy the matched rule's settings onto the URL.
                url.HttpMethod = matchedRuler.HttpMethod;
                url.AppendParams = matchedRuler.AppendParmas;
                foreach (IContentHandler h in matchedRuler.ContentHandlers)
                {
                    url.ContentHandlers.Add(h);
                }

                if (urls.ContainsKey(url.CheckSum))
                {
                    continue;
                }

                urls.Add(url.CheckSum, url);
            }

            return urls;
        }
Exemple #9
0
        /// <summary>
        /// Collects the (ID, NAME) pairs that mReg finds in text content and inserts a
        /// row into the users table for every id that is not stored there yet.
        /// </summary>
        /// <param name="content">Content to inspect; anything that is not a TextContent is ignored.</param>
        /// <param name="context">ProcessContext (not used by this handler).</param>
        public void Process(Content content, ProcessContext context)
        {
            TextContent text = content as TextContent;
            if (null == text)
            {
                return;
            }

            // StringDictionary keeps one entry per id (keys are compared case-insensitively),
            // so an id that appears several times on the page is handled once.
            StringDictionary list = new StringDictionary();
            string uid, uname;

            MatchCollection matches = mReg.Matches(text.Content);
            foreach (Match m in matches)
            {
                if (!m.Success)
                {
                    continue;
                }
                uid = m.Groups["ID"].Value;
                uname = m.Groups["NAME"].Value;
                if (!String.IsNullOrEmpty(uid) && !list.ContainsKey(uid) && !String.IsNullOrEmpty(uname))
                {
                    list.Add(uid, uname);
                    Console.WriteLine("ID={0},Name='{1}'", uid, uname);
                }
            }

            if (list.Count == 0)
            {
                return;
            }

            foreach (string k in list.Keys)
            {
                // The id and name are scraped text, so they are escaped and always embedded
                // as quoted literals (the INSERT already quoted the uid; the SELECT now does too).
                // NOTE(review): assumes MySQL-style literals (backslash is an escape
                // character) - confirm, and switch to parameterized queries if the db
                // helper supports them.
                string safeUid = k.Replace("\\", "\\\\").Replace("'", "''");
                string safeName = list[k].Replace("\\", "\\\\").Replace("'", "''");

                // Skip ids that already have a row.
                string sql = String.Format("select COUNT(`uid`) from users where uid='{0}'", safeUid);
                int n = Convert.ToInt32(db.ExecuteScalar(sql));
                if (n > 0)
                {
                    continue;
                }

                sql = String.Format("INSERT INTO users (uid, uname, email)VALUES('{0}', '{1}', '');", safeUid, safeName);
                db.ExecuteNonQuery(sql);
            }
        }
 /// <summary>
 /// Override in a derived class to process the given content.
 /// </summary>
 /// <param name="content">Content instance</param>
 /// <param name="context">ProcessContext carrying the context information</param>
 public abstract void Process(Content content, ProcessContext context);
        /// <summary>
        /// Finds URL strings in text content using the anchor and image regular expressions.
        /// </summary>
        /// <param name="content">Content instance; must be exactly a TextContent (subclasses are rejected).</param>
        /// <returns>
        /// A list of pairs whose key is the captured URL and whose value is the associated
        /// text (the TEXT group for regexA matches, the ALT group or "" for regexImg matches).
        /// Returns null when content is null or is not exactly a TextContent.
        /// </returns>
        protected override IEnumerable<KeyValuePair<string, string>> FindUrls(Content content)
        {
            if (null == content || content.GetType() != typeof(TextContent))
            {
                return null;
            }

            TextContent page = (TextContent)content;
            List<KeyValuePair<string, string>> found = new List<KeyValuePair<string, string>>();

            // Matches of regexA: URL group plus TEXT group.
            if (null != this.regexA)
            {
                foreach (Match anchor in this.regexA.Matches(page.Content))
                {
                    string href = anchor.Groups["URL"].Value;
                    string label = anchor.Groups["TEXT"].Value;
                    if (!string.IsNullOrEmpty(href))
                    {
                        found.Add(new KeyValuePair<string, string>(href, label));
                    }
                }
            }

            // Matches of regexImg: SRC group, with the ALT text looked up inside the matched tag.
            if (null != this.regexImg)
            {
                foreach (Match image in this.regexImg.Matches(page.Content))
                {
                    string src = image.Groups["SRC"].Value;
                    Match altMatch = UrlExtractor.ImageAltRegex.Match(image.Value);
                    string alt = altMatch.Success ? altMatch.Groups["ALT"].Value : "";
                    if (!string.IsNullOrEmpty(src))
                    {
                        found.Add(new KeyValuePair<string, string>(src, alt));
                    }
                }
            }

            return found;
        }
 /// <summary>
 /// Initializes the instance for the given Content.
 /// </summary>
 /// <param name="content">Value stored in this.Content.</param>
 protected ContentBase(Content content)
     :this()
 {
     this.Content = content;
 }
 /// <summary>
 /// Parameterless constructor that ContentBase(Content) chains to; sets this.Content to null.
 /// </summary>
 private ContentBase()
 {
     this.Content = null;
 }