Exemplo n.º 1
0
        /// <summary>
        /// Processes one news link: downloads the page directly and again through the
        /// content-extraction service, passes both responses to <c>GetNewsInfo</c>, and
        /// then calls <c>InsertNews</c> and <c>InsertVisited</c>.
        /// </summary>
        /// <param name="oldUri">Address of the original news article.</param>
        /// <returns>
        /// 0 when neither download returned the timeout text; 1 when the direct download
        /// did; 2 when the extraction-service download did (2 wins when both timed out).
        /// </returns>
        internal static int ParseNewsUri(MyUri oldUri)
        {
            // The downloads are compared against this exact text to detect a timeout.
            const string timeoutMessage = "操作超时";

            // NOTE(review): the service password is embedded in source and the article
            // address is appended without URL-escaping — confirm both with the service owner.
            string constructUri = string.Format(@"http://59.39.71.239:886/getnewscontent.aspx?password=Kcis123_AutoGetNewsContent&url={0}", oldUri.AbsoluteUri);
            MyUri     newUri    = new MyUri(constructUri);
            WebHelper webHelper = new WebHelper();

            string oldResponseStr = webHelper.GetContent(oldUri);
            string newResponseStr = webHelper.GetContent(newUri);

            // Static string.Equals tolerates a null response; the instance call would throw.
            int result = 0;
            if (string.Equals(oldResponseStr, timeoutMessage))
            {
                result = 1;
            }
            if (string.Equals(newResponseStr, timeoutMessage))
            {
                result = 2;
            }

            GetNewsInfo(oldResponseStr, newResponseStr, oldUri);
            InsertNews();
            InsertVisited();
            return result;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Chooses the portal-specific rule number from the article address and
        /// delegates the forum (section) lookup to <c>GetNewsForum</c>.
        /// </summary>
        /// <param name="responseStr">Page content handed through to <c>GetNewsForum</c>.</param>
        /// <param name="uri">Article address; the first marker it contains selects the rule.</param>
        /// <returns>
        /// The value returned by <c>GetNewsForum</c>, or <c>null</c> when the address
        /// contains none of the known portal markers.
        /// </returns>
        private static string GetNewsForumTrans(string responseStr, MyUri uri)
        {
            // Position + 1 is the rule number passed to GetNewsForum; array order is match priority.
            string[] portalMarkers = { "sina.com", "qq.com", "sohu.com", "163.com", "ifeng.com" };

            for (int i = 0; i < portalMarkers.Length; i++)
            {
                if (uri.AbsoluteUri.Contains(portalMarkers[i]))
                {
                    return GetNewsForum(responseStr, i + 1);
                }
            }
            return null;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Decides whether a link is worth collecting. A link whose news record already
        /// exists is not; in that case <c>UpdateUri</c> is called for it instead.
        /// </summary>
        /// <param name="newUri">Candidate link.</param>
        /// <returns>
        /// <c>true</c> when no record exists for the address, the address contains none
        /// of the excluded fragments, and it ends with one of the accepted page suffixes.
        /// </returns>
        private bool Isvalueable(MyUri newUri)
        {
            MysqlHelper mysqlHelper = new MysqlHelper();

            if (mysqlHelper.IsNewsExist(newUri.AbsoluteUri))
            {
                UpdateUri(newUri);
                return false;
            }

            string address = newUri.AbsoluteUri;

            // Addresses containing any of these fragments are never collected.
            string[] excludedFragments =
            {
                @"163.com/special",
                @"http://comment.ifeng.com/",
                @"http://comment2.news.sohu.com",
                @"http://news.ifeng.com/photo",
                @"http://slide.mil.news.sina.com.cn",
                @"slide.news.sina",
                "video.sina"
            };
            foreach (string fragment in excludedFragments)
            {
                if (address.Contains(fragment))
                {
                    return false;
                }
            }

            string[] acceptedSuffixes = { ".shtml", "html", "htm" };
            foreach (string suffix in acceptedSuffixes)
            {
                if (address.EndsWith(suffix))
                {
                    return true;
                }
            }
            return false;
        }
Exemplo n.º 4
0
 /// <summary>
 /// Reads the content at the given address. One specific address is fetched
 /// with <c>getHtml2</c>; every other address goes through <c>getHtml1</c>.
 /// </summary>
 /// <param name="uri">Address to download.</param>
 /// <returns>Whatever the selected fetch routine returns.</returns>
 public string GetContent(MyUri uri)
 {
     bool useSecondFetcher = uri.AbsoluteUri == "http://comment.news.sohu.com/djpm/";

     return useSecondFetcher ? getHtml2(uri) : getHtml1(uri);
 }
Exemplo n.º 5
0
        /// <summary>
        /// Builds a <c>MyUri</c> for a link found on a page, resolving it against the
        /// page address when needed, one level deeper than that page.
        /// </summary>
        /// <param name="strRef">Link text as found in the page.</param>
        /// <param name="uri">Address of the page the link was found on.</param>
        /// <returns>The new address with <c>Depth</c> set to <c>uri.Depth + 1</c>.</returns>
        private MyUri GetNewUri(string strRef, MyUri uri)
        {
            bool hasParentSegment = strRef.IndexOf("..") >= 0;
            bool needsResolving   = hasParentSegment || strRef.StartsWith("/") || !strRef.StartsWith("http://");

            // Anything that is not already a plain http:// address is resolved against the page.
            string target = needsResolving ? new Uri(uri, strRef).AbsoluteUri : strRef;

            MyUri resolved = new MyUri(target);
            resolved.Depth = uri.Depth + 1;
            return resolved;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, decoding it with the encoding
        /// named by <c>uri.Encoding</c>, and returns the part matched by the
        /// <c>&lt;style&gt;.*&lt;style&gt;</c> pattern.
        /// </summary>
        /// <remarks>
        /// Fixes over the previous version: the retry loop is bounded (it used to spin
        /// forever when every attempt failed), a failed request no longer dereferences a
        /// null response, the body is read on success rather than on failure, compressed
        /// bodies are decoded by the framework instead of always assuming gzip, and the
        /// response is disposed. The unused proxy object and its credentials were removed.
        /// </remarks>
        /// <param name="uri">Address to download.</param>
        /// <returns>
        /// The matched text (empty when the pattern does not match), or the message of
        /// the last exception when all attempts fail.
        /// </returns>
        private static string getHtml2(MyUri uri)
        {
            const int maxAttempts = 7;
            string    lastError   = null;

            for (int attempt = 0; attempt < maxAttempts; attempt++)
            {
                try
                {
                    HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(uri.AbsoluteUri);
                    // Sends the Accept-Encoding header and transparently decodes gzip/deflate bodies.
                    wr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                    Thread.Sleep(30);

                    using (HttpWebResponse response = (HttpWebResponse)wr.GetResponse())
                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(uri.Encoding)))
                    {
                        string body = reader.ReadToEnd();
                        // NOTE(review): the pattern ends in "<style>", not "</style>" — confirm this is intended.
                        return Regex.Match(body, "<style>.*<style>", RegexOptions.Singleline).Value;
                    }
                }
                catch (Exception ex)
                {
                    log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                    log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, ex.Message));
                    lastError = ex.Message;
                }
            }
            return lastError;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Takes the next news link from the news queue, if there is one, and records
        /// the time of the dequeue in <c>_lastDequeueNewsTime</c>.
        /// </summary>
        /// <returns>The dequeued link, or <c>null</c> when the queue is empty.</returns>
        private MyUri DequeueNewsUri()
        {
            // lock releases the monitor even if an exception or thread abort interrupts
            // the body; the previous Monitor.Enter/Exit pair would have leaked it.
            lock (_UriNewsQueue)
            {
                if (_UriNewsQueue.Count == 0)
                {
                    return null;
                }

                MyUri newsUri = _UriNewsQueue.Dequeue();
                _lastDequeueNewsTime = DateTime.Now;
                return newsUri;
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Checks that each fixture (defined outside this method) reports the same hash
        /// code as the plain value it is compared with.
        /// </summary>
        public static void GetHashCode_Some()
        {
            int expectedForOne = 1.GetHashCode();
            Assert.Equal(expectedForOne, One.GetHashCode());

            int expectedForTwo = 2.GetHashCode();
            Assert.Equal(expectedForTwo, Two.GetHashCode());

            int expectedForTwoL = 2L.GetHashCode();
            Assert.Equal(expectedForTwoL, TwoL.GetHashCode());
#if !(NETSTANDARD2_0 || NETSTANDARD1_x || NETFRAMEWORK) // GetHashCode(StringComparison)
            int expectedForText = MyText.GetHashCode(StringComparison.Ordinal);
            Assert.Equal(expectedForText, SomeText.GetHashCode());
#endif
            int expectedForUri = MyUri.GetHashCode();
            Assert.Equal(expectedForUri, SomeUri.GetHashCode());

            // Same check for a freshly created value/wrapper pair.
            var pair = AnyT.New();
            int expectedForAny = pair.Value.GetHashCode();
            Assert.Equal(expectedForAny, pair.Some.GetHashCode());
        }
Exemplo n.º 9
0
        /// <summary>
        /// Body of the threads that process news links. It repeatedly dequeues a link
        /// and parses it, updating the link and timeout counters, until
        /// <c>isFinished()</c> reports completion; then it calls
        /// <c>AbordNewsThreads()</c> and may write the run summary to the log.
        /// </summary>
        /// <param name="obj">Thread start argument; not used.</param>
        private void ThreadFunction_news(object obj)
        {
            while (true)
            {
                MyUri newsUri = DequeueNewsUri();
                if (newsUri != null)
                {
                    // NOTE(review): the instance is never used (ParseNewsUri is static). It is
                    // kept in case the constructor initialises shared state — confirm and remove.
                    NewsHelper newsHelper = new NewsHelper();
                    int        result     = NewsHelper.ParseNewsUri(newsUri);

                    // NOTE(review): these counters are updated from several threads without
                    // synchronisation — confirm whether exact totals matter.
                    _allKdUrls += 1;
                    if (result == 1)
                    {
                        _timeOutNormalUrl += 1;
                    }
                    else if (result == 2)
                    {
                        _timeOutKdUrl += 1;
                    }
                }
                else if (isFinished())
                {
                    AbordNewsThreads();
                    break;
                }
                else
                {
                    // Queue is empty but work is not finished yet: wait before polling again.
                    Thread.Sleep(1 * 1000);
                }
            }

            // Counts threads that are in the Aborted or Stopped state.
            // NOTE(review): the summary is written when at most one thread has ended, which
            // looks like the first thread to finish rather than the last — confirm the intent.
            int endedThreads = 0;

            foreach (var thread in _threadsNews)
            {
                if (thread.ThreadState == ThreadState.Aborted || thread.ThreadState == ThreadState.Stopped)
                {
                    endedThreads++;
                }
            }
            if (endedThreads <= 1)
            {
                log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                log.Debug(string.Format("结束采集{0}", DateTime.Now));
                // The total-link line used to be logged twice; it is now logged once.
                log.Debug(string.Format("全部链接数{0}", _allKdUrls));
                log.Debug(string.Format("kd超时链接数{0}", _timeOutKdUrl));
                log.Debug(string.Format("门户超时链接数{0}", _timeOutNormalUrl));
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Takes the next link from the seed queue.
        /// </summary>
        /// <returns>
        /// The dequeued link, or <c>null</c> when the queue is empty or the dequeued
        /// item is not a <c>MyUri</c>.
        /// </returns>
        private MyUri DequeueSeedUri()
        {
            // lock guarantees the monitor is released; the emptiness check replaces the
            // previous approach of letting Dequeue throw and swallowing the exception.
            lock (_UriSeedQueue)
            {
                if (_UriSeedQueue.Count == 0)
                {
                    return null;
                }
                // "as" keeps the old outcome (null) for an item of an unexpected type.
                return _UriSeedQueue.Dequeue() as MyUri;
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Loads every row of the <c>info_seed</c> table and converts the rows that
        /// have a non-empty <c>seed_url</c> into <c>MyUri</c> seeds.
        /// </summary>
        /// <returns>
        /// The seeds, each with <c>Encoding</c>, <c>Id</c> and <c>Name</c> populated from
        /// the row; an empty list when the query fails (the failure is logged).
        /// </returns>
        internal List <MyUri> SelectAllSeeds()
        {
            const string selectSeedsSql = @"select * 
                            from info_seed ";

            List <MyUri> seeds    = new List <MyUri>();
            DataSet      seedData = null;

            using (MySqlConnection conn = new MySqlConnection(connStr))
            {
                try
                {
                    conn.Open();
                    using (MySqlDataAdapter adapter = new MySqlDataAdapter(selectSeedsSql, conn))
                    {
                        DataSet loaded = new DataSet();
                        adapter.Fill(loaded, "result_data");
                        // Published only after a successful Fill, so a failed query can no longer
                        // leave a table-less DataSet behind for the loop below to trip over.
                        seedData = loaded;
                    }
                }
                catch (Exception e)
                {
                    log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                    log.Debug(e.Message);
                }
            }

            if (seedData == null)
            {
                return seeds;
            }

            foreach (DataRow row in seedData.Tables["result_data"].Rows)
            {
                string seedUrl = row["seed_url"].ToString();
                int    id      = int.Parse(row["ID"].ToString());
                if (string.IsNullOrEmpty(seedUrl))
                {
                    continue;
                }

                MyUri seed = new MyUri(seedUrl);
                seed.Encoding = row["seed_encoding"].ToString();
                seed.Id       = id;
                seed.Name     = row["seed_name"].ToString();
                seeds.Add(seed);
            }
            return seeds;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Chooses the portal-specific rule number from the article address and
        /// delegates the publication-time lookup to <c>GetNewsTime</c>.
        /// </summary>
        /// <param name="responseStr">Page content handed through to <c>GetNewsTime</c>.</param>
        /// <param name="uri">Article address used to select the rule.</param>
        /// <returns>
        /// The value returned by <c>GetNewsTime</c>, or <c>null</c> when the address
        /// contains none of the known portal markers.
        /// </returns>
        private static string GetNewsTimeTrans(string responseStr, MyUri uri)
        {
            // Rule 1: 163.com and qq.com.
            if (uri.AbsoluteUri.Contains("163.com") || uri.AbsoluteUri.Contains("qq.com"))
            {
                return GetNewsTime(responseStr, 1);
            }
            // Rule 2: sina.com.
            if (uri.AbsoluteUri.Contains("sina.com"))
            {
                return GetNewsTime(responseStr, 2);
            }
            // Rule 3: sohu.com and ifeng.com.
            if (uri.AbsoluteUri.Contains("sohu.com") || uri.AbsoluteUri.Contains("ifeng.com"))
            {
                return GetNewsTime(responseStr, 3);
            }
            return null;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Downloads a seed page, extracts the news links from it with
        /// <c>GetNewsUri</c> and hands them to <c>EnqueueNewsUris</c>. Progress, or the
        /// failure to download, is written to the console and the log.
        /// </summary>
        /// <param name="seedUri">Address of the seed (listing) page.</param>
        private void ParseSeedUri(MyUri seedUri)
        {
            string contentStr = new WebHelper().GetContent(seedUri);

            if (contentStr != null)
            {
                List <MyUri> discovered = GetNewsUri(contentStr, seedUri);

                Console.WriteLine(seedUri.AbsoluteUri + "--->" + discovered.Count + "--->" + DateTime.Now);
                log4net.LogManager.GetLogger("MyLogger").Debug(seedUri.AbsoluteUri + "--->" + discovered.Count + "--->" + DateTime.Now);
                EnqueueNewsUris(discovered);
                return;
            }

            // Nothing came back for this seed: report it and move on.
            Console.WriteLine("获取不到种子列表信息");
            log4net.LogManager.GetLogger("MyLogger").Debug(string.Format("{0}--->{1}", "获取不到种子列表信息", seedUri.AbsoluteUri));
        }
Exemplo n.º 14
0
        /// <summary>
        /// Determines whether the address is already recorded in the
        /// <c>info_visited</c> table.
        /// </summary>
        /// <param name="uri">Address to look up by its <c>AbsoluteUri</c>.</param>
        /// <returns>
        /// <c>true</c> when at least one matching row exists; <c>false</c> when none
        /// exists or the query fails (the failure is logged).
        /// </returns>
        internal bool IsVisited(MyUri uri)
        {
            // The address comes from crawled pages, so it is bound as a parameter rather
            // than formatted into the SQL text (which allowed injection and broke on quotes).
            const string selectVisitedSql = @"select * from info_visited where visited_url = @visitedUrl";
            string       visitedUrl       = uri.AbsoluteUri;

            using (MySqlConnection conn = new MySqlConnection(connStr))
            {
                try
                {
                    using (MySqlDataAdapter adapter = new MySqlDataAdapter(selectVisitedSql, conn))
                    {
                        adapter.SelectCommand.Parameters.AddWithValue("@visitedUrl", visitedUrl);

                        DataSet matches = new DataSet();
                        adapter.Fill(matches, "result_data");
                        return matches.Tables["result_data"].Rows.Count != 0;
                    }
                }
                catch (Exception e)
                {
                    log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                    log.Debug(e.Message);
                    return false;
                }
            }
        }
Exemplo n.º 15
0
 /// <summary>
 /// Body of the threads that process seed links. It repeatedly dequeues a seed
 /// and parses it until <c>IsSeedHandled()</c> reports that the work is done,
 /// then calls <c>AbordSeedThreads()</c> and returns.
 /// </summary>
 /// <param name="obj">Thread start argument; not used.</param>
 private void ThreadFunction_seed(object obj)
 {
     const int pauseMilliseconds = 1 * 1000;
     bool      keepRunning       = true;

     while (keepRunning)
     {
         MyUri uri = DequeueSeedUri();

         if (uri == null)
         {
             if (IsSeedHandled())
             {
                 AbordSeedThreads();
                 keepRunning = false;
             }
             else
             {
                 // Nothing queued yet: wait before polling again.
                 Thread.Sleep(pauseMilliseconds);
             }
         }
         else
         {
             // Pause before each seed download.
             Thread.Sleep(pauseMilliseconds);
             ParseSeedUri(uri);
         }
     }
 }
Exemplo n.º 16
0
        /// <summary>
        /// Extracts the news links from a page: script blocks are removed, every
        /// <c>href</c>/<c>src</c> attribute is resolved against the page address, and
        /// the links that are neither visited nor rejected by <c>Isvalueable</c> are kept.
        /// </summary>
        /// <param name="contentStr">Page content to scan.</param>
        /// <param name="uri">Address of the page; kept links inherit its name, encoding and id.</param>
        /// <returns>The kept links, in the order they appear in the page.</returns>
        private List <MyUri> GetNewsUri(string contentStr, MyUri uri)
        {
            const string scriptPattern = @"<script(\s[^>]*?)?>[\s\S]*?</script>";
            const string linkPattern   = @"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']";

            string       withoutScripts = Regex.Replace(contentStr, scriptPattern, "", RegexOptions.IgnoreCase);
            List <MyUri> kept           = new List <MyUri>();

            foreach (Match linkMatch in Regex.Matches(withoutScripts, linkPattern))
            {
                MyUri candidate = GetNewUri(GetRef(linkMatch), uri);

                candidate.IsVisited = IsVisited(candidate);
                // Isvalueable is only consulted for links that have not been visited.
                if (candidate.IsVisited || !Isvalueable(candidate))
                {
                    continue;
                }

                candidate.Name     = uri.Name;
                candidate.Encoding = uri.Encoding;
                candidate.Fk_seed  = uri.Id;
                kept.Add(candidate);
            }
            return kept;
        }
Exemplo n.º 17
0
        /// <summary>
        /// Updates the insert time of a news record by passing the address to
        /// <c>MysqlHelper.UpdateInsertTime</c>.
        /// </summary>
        /// <param name="newUri">Address of the news record to update.</param>
        private void UpdateUri(MyUri newUri)
        {
            new MysqlHelper().UpdateInsertTime(newUri.AbsoluteUri);
        }
Exemplo n.º 18
0
 /// <summary>
 /// Enqueues every <c>*.txt</c> file of a folder and then does the same for each
 /// sub-folder, one depth level deeper. Both loops stop early once
 /// <c>ThreadsRunning</c> becomes false.
 /// </summary>
 /// <param name="folderName">Folder to scan.</param>
 /// <param name="nDepth">Depth assigned to the files of this folder.</param>
 private void ParseFolder(string folderName, int nDepth)
 {
     DirectoryInfo folder = new DirectoryInfo(folderName);

     foreach (FileInfo textFile in folder.GetFiles("*.txt"))
     {
         if (!this.ThreadsRunning)
         {
             break;
         }

         MyUri fileUri = new MyUri(textFile.FullName);
         fileUri.Depth = nDepth;
         this.EnqueueUri(fileUri, true);
     }

     int childDepth = nDepth + 1;
     foreach (DirectoryInfo subFolder in folder.GetDirectories())
     {
         if (!this.ThreadsRunning)
         {
             break;
         }
         this.ParseFolder(subFolder.FullName, childDepth);
     }
 }
Exemplo n.º 19
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/> and returns its text. For four
        /// ranking pages only the section matched by a page-specific pattern is returned.
        /// </summary>
        /// <remarks>
        /// The request is attempted once. The previous retry loop could never repeat
        /// because its error flag was never set, so it was removed without changing
        /// behaviour. A failed request now returns straight away instead of running into
        /// a null response, and the response and reader are disposed even when reading
        /// fails. The unused proxy object and its credentials were removed.
        /// </remarks>
        /// <param name="uri">
        /// Address to download. Responses from the extraction service are decoded as
        /// UTF-8; all others use the encoding named by <c>uri.Encoding</c>.
        /// </param>
        /// <returns>
        /// The page text (or matched section); the exception message when the request
        /// fails; <c>null</c> when the request succeeds but reading the body fails.
        /// </returns>
        private static string getHtml1(MyUri uri)
        {
            string          tempCode = null;
            HttpWebResponse response = null;
            HttpWebRequest  request  = HttpWebRequest.Create(uri.AbsoluteUri) as HttpWebRequest;

            Thread.Sleep(30);
            request.AllowAutoRedirect = true;
            try
            {
                response = request.GetResponse() as HttpWebResponse;
            }
            catch (Exception ex)
            {
                log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, ex.Message));
                // Callers compare this text against known failure messages.
                tempCode = ex.Message;
            }

            if (response == null)
            {
                return tempCode;
            }

            try
            {
                using (response)
                {
                    Encoding pageEncoding = uri.AbsoluteUri.Contains("http://59.39.71.239:886/")
                        ? Encoding.UTF8
                        : Encoding.GetEncoding(uri.Encoding);

                    using (StreamReader sr = new StreamReader(response.GetResponseStream(), pageEncoding))
                    {
                        tempCode = sr.ReadToEnd();
                    }
                }

                // Ranking pages: keep only the section that lists the articles.
                switch (uri.AbsoluteUri)
                {
                    case "http://news.163.com/rank/":
                        tempCode = Regex.Match(tempCode, @"<h2>全站</h2>.*<h2>科技</h2>", RegexOptions.Singleline).Value;
                        break;

                    case "http://news.qq.com/paihang.htm":
                        // NOTE(review): the pattern ends in "<tbody>", not "</tbody>" — confirm this is intended.
                        tempCode = Regex.Match(tempCode, "<tbody>.*<tbody>", RegexOptions.Singleline).Value;
                        break;

                    case "http://news.ifeng.com/hotnews/":
                        tempCode = Regex.Match(tempCode, "shtml.*<h4>资讯</h4>", RegexOptions.Singleline).Value;
                        break;

                    case "http://news.sina.com.cn/hotnews/":
                        tempCode = Regex.Match(tempCode, "<!--seo内容输出开始-->.*<!--seo内容输出结束-->", RegexOptions.Singleline).Value;
                        break;
                }
            }
            catch (Exception e)
            {
                log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
                log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, e.Message));
            }
            return tempCode;
        }
Exemplo n.º 20
0
 /// <summary>
 /// Registers an address in <c>urlStorage</c> unless its host contains one of the
 /// non-blank patterns in <c>ExcludeHosts</c>.
 /// </summary>
 /// <param name="uri">Address to register.</param>
 /// <returns>
 /// <c>true</c> when the storage reports a count of 1 for the address after adding
 /// it; <c>false</c> when the host is excluded (which is logged), the count
 /// differs, or adding fails.
 /// </returns>
 private bool AddURL(ref MyUri uri)
 {
     foreach (string str in this.ExcludeHosts)
     {
         string pattern = str.Trim();
         if ((pattern.Length > 0) && (uri.Host.ToLower().IndexOf(pattern) != -1))
         {
             this.LogError(uri.AbsoluteUri, "\r\nHost excluded as it includes reserved pattern (" + str + ")");
             return false;
         }
     }

     bool added = false;
     // lock releases the monitor even when a thread abort is re-raised after the
     // catch block; the previous Monitor.Enter/Exit pair skipped the Exit in that case.
     lock (this.urlStorage)
     {
         try
         {
             string absoluteUri = uri.AbsoluteUri;
             added = this.urlStorage.Add(ref absoluteUri).Count == 1;
         }
         catch (Exception)
         {
             // Best effort, as before: a failure is reported to the caller as "not added".
         }
     }
     return added;
 }
Exemplo n.º 21
0
 /// <summary>
 /// Fills the shared <c>_news</c> record for one article: source, seed key and
 /// address always; content, title, forum and time only when both responses are
 /// non-empty.
 /// </summary>
 /// <param name="oldResponseStr">Response of the direct download; used for forum and time.</param>
 /// <param name="newResponseStr">Response of the extraction service; used for content and title.</param>
 /// <param name="uri">Address of the article, carrying its name and seed key.</param>
 private static void GetNewsInfo(string oldResponseStr, string newResponseStr, MyUri uri)
 {
     _news.Source       = uri.Name;
     _news.Fk_seed_news = uri.Fk_seed;
     _news.Url          = uri.AbsoluteUri;

     // Diagnostics for records that would be stored without a seed key or an address.
     if (_news.Fk_seed_news == null || _news.Fk_seed_news == 0)
     {
         Console.WriteLine("{0}的外键---〉{1}", uri.AbsoluteUri, uri.Fk_seed);
     }
     if (_news.Url == null || _news.Url == "")
     {
         Console.WriteLine("{0}的链接", uri.AbsoluteUri);
     }

     bool haveBothResponses = !string.IsNullOrEmpty(newResponseStr) && !string.IsNullOrEmpty(oldResponseStr);
     if (!haveBothResponses)
     {
         // NOTE(review): the remaining fields keep whatever they held before — confirm
         // that stale content from an earlier article cannot be stored.
         return;
     }

     _news.Content = GetNewsContent(newResponseStr);
     _news.Title   = GetNewsTitle(newResponseStr);
     _news.Forum   = GetNewsForumTrans(oldResponseStr, uri);
     _news.Time    = GetNewsTimeTrans(oldResponseStr, uri);
 }
Exemplo n.º 22
0
 /// <summary>
 /// Starts a crawl for the column identified by <c>Global.COLUMN_ID</c>. It reads
 /// the matching <c>Web_URL</c> rows, copies their settings into <c>Global</c> and
 /// instance fields, and enqueues each row's path: as a folder scan when the
 /// path is a directory, otherwise as a single address.
 /// </summary>
 /// <remarks>
 /// The connection, command and reader are now disposed; previously none of them
 /// was ever closed. On any exception the error is logged and the method returns
 /// without disabling the continue button.
 /// </remarks>
 private void RunParser()
 {
     this.ThreadsRunning = true;
     try
     {
         long num = long.Parse(Global.COLUMN_ID.ToString());

         // num is a parsed integer, so appending it to the SQL text cannot inject anything.
         using (OleDbConnection connection = new OleDbConnection(CONN_ACCESS.ConnString))
         using (OleDbCommand command = new OleDbCommand("SELECT * FROM Web_URL where id=" + num, connection))
         {
             connection.Open();
             using (OleDbDataReader reader = command.ExecuteReader(CommandBehavior.CloseConnection))
             {
                 while (reader.Read())
                 {
                     string path = reader["web_url"].ToString().Replace("\r", "");
                     Global.BM = reader["bm"].ToString();
                     Global.BXBH = reader["bxbh"].ToString();
                     Global.BDBH = reader["bdbh"].ToString();
                     Global.WEB_COLUMN_ID = reader["id"].ToString();
                     Global.WEB_COLUMN_NAME = reader["column_name"].ToString();
                     this.page_py = reader["column_name"].ToString();
                     Global.WEB_PARENT_ID = reader["parentid"].ToString();
                     Global.WEB_CODE = reader["code"].ToString();
                     Global.URL_BIANHAO = reader["id"].ToString();
                     this.zl_yes = reader["class"].ToString();
                     Global.ONEURL = path;
                     this.urllhost = path.Split(new char[] { '.' });

                     if (Directory.Exists(path))
                     {
                         this.ParseFolder(path, 0);
                     }
                     else
                     {
                         // Not a local file either: treat it as a web address and normalise it.
                         if (!System.IO.File.Exists(path))
                         {
                             this.Normalize(ref path);
                             this.comboBoxWeb.Text = path;
                         }
                         MyUri uri = new MyUri(path);
                         this.EnqueueUri(uri, false);
                     }
                 }
             }
         }
     }
     catch (Exception exception)
     {
         this.LogError(exception.Message, exception.Message);
         return;
     }
     this.toolBarButtonContinue.Enabled = false;
 }
Exemplo n.º 23
0
 /// <summary>
 /// Forwards the visited check for an address to <c>VisitedHelper.IsVisited</c>.
 /// </summary>
 /// <param name="newUri">Address to check.</param>
 /// <returns>The value returned by <c>VisitedHelper.IsVisited</c>.</returns>
 private bool IsVisited(MyUri newUri)
 {
     return(VisitedHelper.IsVisited(newUri));
 }
Exemplo n.º 24
0
 /// <summary>
 /// Forwards the given address to <c>MyUri.ShowUri</c>.
 /// </summary>
 /// <param name="uri">Address passed through unchanged.</param>
 public void ShowUri(string uri)
 {
     MyUri.ShowUri(uri);
 }
Exemplo n.º 25
0
 /// <summary>
 /// Adds an address to the crawl queue, optionally registering it through
 /// <c>AddURL</c> first.
 /// </summary>
 /// <param name="uri">Address to enqueue.</param>
 /// <param name="bCheckRepetition">
 /// When <c>true</c>, the address is enqueued only if <c>AddURL</c> accepts it.
 /// </param>
 /// <returns>
 /// <c>false</c> when the check was requested and <c>AddURL</c> rejected the
 /// address; otherwise <c>true</c>.
 /// </returns>
 private bool EnqueueUri(MyUri uri, bool bCheckRepetition)
 {
     if (bCheckRepetition && !this.AddURL(ref uri))
     {
         return false;
     }

     // lock releases the monitor even when a thread abort is re-raised after the
     // catch block; the previous Monitor.Enter/Exit pair skipped the Exit in that case.
     lock (this.queueURLS)
     {
         try
         {
             this.queueURLS.Enqueue(uri);
         }
         catch (Exception)
         {
             // Best effort, as before: an enqueue failure is not reported to the caller.
         }
     }
     return true;
 }
Exemplo n.º 26
0
 private void ParseUri(MyUri uri, ref MyWebRequest request)
 {
     string str = "";
     if ((request != null) && request.response.KeepAlive)
     {
         str = str + "连接转至: " + uri.Host + "\r\n\r\n";
     }
     else
     {
         str = str + "连接: " + uri.Host + "\r\n\r\n";
     }
     ListViewItem item = null;
     Monitor.Enter(this.listViewThreads);
     try
     {
         item = this.listViewThreads.Items[int.Parse(Thread.CurrentThread.Name)];
         item.SubItems[1].Text = uri.Depth.ToString();
         item.ImageIndex = 1;
         item.BackColor = System.Drawing.Color.WhiteSmoke;
         item.SubItems[2].Text = "正在连接";
         item.ForeColor = System.Drawing.Color.Red;
         item.SubItems[3].Text = uri.AbsoluteUri;
         item.SubItems[4].Text = "";
         item.SubItems[5].Text = "";
     }
     catch (Exception)
     {
     }
     Monitor.Exit(this.listViewThreads);
     try
     {
         object obj2;
         request = MyWebRequest.Create(uri, request, this.KeepAlive);
         request.Timeout = this.RequestTimeout * 0x3e8;
         MyWebResponse response = request.GetResponse();
         str = str + request.Header + response.Header;
         if (!response.ResponseUri.Equals(uri))
         {
             this.EnqueueUri(new MyUri(response.ResponseUri.AbsoluteUri), true);
             obj2 = str;
             str = string.Concat(new object[] { obj2, "重定向到: ", response.ResponseUri, "\r\n" });
             request = null;
         }
         else
         {
             if ((!this.AllMIMETypes && (response.ContentType != null)) && (this.MIMETypes.Length > 0))
             {
                 string str2 = response.ContentType.ToLower();
                 int index = str2.IndexOf(';');
                 if (index != -1)
                 {
                     str2 = str2.Substring(0, index);
                 }
                 if ((str2.IndexOf('*') == -1) && ((index = this.MIMETypes.IndexOf(str2)) == -1))
                 {
                     this.LogError(uri.AbsoluteUri, str + "\r\nUnlisted Content-Type (" + str2 + "), check settings.");
                     request = null;
                     return;
                 }
                 Match match = new Regex(@"\d+").Match(this.MIMETypes, index);
                 int num3 = int.Parse(match.Value) * 0x400;
                 int num4 = int.Parse(match.NextMatch().Value) * 0x400;
                 if ((num3 < num4) && ((response.ContentLength < num3) || (response.ContentLength > num4)))
                 {
                     this.LogError(uri.AbsoluteUri, string.Concat(new object[] { str, "\r\nContentLength limit error (", response.ContentLength, ")" }));
                     request = null;
                     return;
                 }
             }
             string[] strArray = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" };
             bool flag = true;
             foreach (string str3 in strArray)
             {
                 if (uri.AbsoluteUri.ToLower().EndsWith(str3))
                 {
                     flag = false;
                     break;
                 }
             }
             foreach (string str3 in this.ExcludeFiles)
             {
                 if ((str3.Trim().Length > 0) && uri.AbsoluteUri.ToLower().EndsWith(str3))
                 {
                     flag = false;
                     break;
                 }
             }
             string strBody = uri.ToString();
             if (this.Compared(uri.LocalPath.Substring(uri.LocalPath.LastIndexOf('.') + 1).ToLower()) && (uri.ToString().Substring(uri.ToString().Length - 1, 1) != "/"))
             {
                 this.LogError("丢弃--非网页文件", strBody);
             }
             else
             {
                 int num5;
                 UriKind absolute = UriKind.Absolute;
                 if (!string.IsNullOrEmpty(strBody) && Uri.IsWellFormedUriString(strBody, absolute))
                 {
                     string page = GetPage(strBody);
                     Stopwatch stopwatch = new Stopwatch();
                     stopwatch.Start();
                     Html html = new Html {
                         Web = page,
                         Url = strBody
                     };
                     CommonAnalyze analyze = new CommonAnalyze();
                     analyze.LoadHtml(html);
                     Net.LikeShow.ContentAnalyze.Document result = analyze.GetResult();
                     stopwatch.Stop();
                     string bt = result.Title.Replace("[(title)]", "");
                     switch (bt)
                     {
                         case null:
                         case "":
                             bt = result.Doc.Substring(20).ToString();
                             break;
                     }
                     if ((result.Doc == null) || (result.Doc == ""))
                     {
                         this.LogError("丢弃--空内容或非内空页", strBody);
                     }
                     else
                     {
                         Lucene.Net.Documents.Document document3;
                         string str7 = result.Doc + bt;
                         if (this.cgcount >= 10)
                         {
                             string keywords = this.MD5string(result.Doc.ToString());
                             string keyWordsSplitBySpace = "";
                             IndexSearcher searcher = new IndexSearcher(this.path);
                             keyWordsSplitBySpace = GetKeyWordsSplitBySpace(keywords, new KTDictSegTokenizer());
                             Query query = new QueryParser("J_md5_bai", new KTDictSegAnalyzer(true)).Parse(keyWordsSplitBySpace);
                             if (searcher.Search(query).Doc(0).Get("J_md5_bai") == keywords)
                             {
                                 this.LogError("排除--重复", strBody);
                             }
                             else
                             {
                                 this.cgcount++;
                                 this.LogUri(bt, "引索完成");
                                 document3 = new Lucene.Net.Documents.Document();
                                 document3.Add(new Field("分类", this.page_py, Field.Store.YES, Field.Index.TOKENIZED));
                                 document3.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED));
                                 document3.Add(new Field("J_msgContent_bai", str7, Field.Store.YES, Field.Index.TOKENIZED));
                                 document3.Add(new Field("J_SiteType_bai", result.SiteType.ToString(), Field.Store.YES, Field.Index.NO));
                                 document3.Add(new Field("J_URL_bai", strBody, Field.Store.YES, Field.Index.NO));
                                 document3.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO));
                                 document3.Add(new Field("J_md5_bai", this.MD5string(result.Doc.ToString()), Field.Store.YES, Field.Index.TOKENIZED));
                                 this.writer.AddDocument(document3);
                             }
                         }
                         else
                         {
                             this.cgcount++;
                             this.LogUri(bt, "引索完成");
                             document3 = new Lucene.Net.Documents.Document();
                             document3.Add(new Field("分类", this.page_py, Field.Store.YES, Field.Index.TOKENIZED));
                             document3.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED));
                             document3.Add(new Field("J_msgContent_bai", str7, Field.Store.YES, Field.Index.TOKENIZED));
                             document3.Add(new Field("J_SiteType_bai", result.SiteType.ToString(), Field.Store.YES, Field.Index.NO));
                             document3.Add(new Field("J_URL_bai", strBody, Field.Store.YES, Field.Index.NO));
                             document3.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO));
                             document3.Add(new Field("J_md5_bai", this.MD5string(result.Doc.ToString()), Field.Store.YES, Field.Index.TOKENIZED));
                             this.writer.AddDocument(document3);
                         }
                     }
                 }
                 item.SubItems[2].Text = "正在下载";
                 item.ForeColor = System.Drawing.Color.Black;
                 string input = "";
                 byte[] buffer = new byte[0x2800];
                 int nNum = 0;
                 while ((num5 = response.socket.Receive(buffer, 0, 0x2800, SocketFlags.None)) > 0)
                 {
                     nNum += num5;
                     if (flag)
                     {
                         input = input + Encoding.ASCII.GetString(buffer, 0, num5);
                     }
                     item.SubItems[4].Text = this.Commas(nNum);
                     if (response.ContentLength > 0)
                     {
                         item.SubItems[5].Text = '%' + ((100 - (((response.ContentLength - nNum) * 100) / response.ContentLength))).ToString();
                     }
                     if ((response.KeepAlive && (nNum >= response.ContentLength)) && (response.ContentLength > 0))
                     {
                         break;
                     }
                 }
                 if (response.KeepAlive)
                 {
                     str = str + "Connection kept alive to be used in subpages.\r\n";
                 }
                 else
                 {
                     response.Close();
                     str = str + "Connection closed.\r\n";
                 }
                 this.FileCount++;
                 this.ByteCount += nNum;
                 if ((this.ThreadsRunning && flag) && (uri.Depth < this.WebDepth))
                 {
                     str = str + "\r\nParsing page ...\r\n";
                     string pattern = "(href|HREF|src|SRC)[ ]*=[ ]*[\"'][^\"'#>]+[\"']";
                     MatchCollection matchs = new Regex(pattern).Matches(input);
                     obj2 = str;
                     str = string.Concat(new object[] { obj2, "Found: ", matchs.Count, " ref(s)\r\n" });
                     this.URLCount += matchs.Count;
                     foreach (Match match in matchs)
                     {
                         pattern = match.Value.Substring(match.Value.IndexOf('=') + 1).Trim(new char[] { '"', '\'', '#', ' ', '>' });
                         try
                         {
                             if (!(((pattern.IndexOf("..") == -1) && !pattern.StartsWith("/")) && pattern.StartsWith("http://")))
                             {
                                 pattern = new Uri(uri, pattern).AbsoluteUri;
                             }
                             this.Normalize(ref pattern);
                             MyUri uri2 = new MyUri(pattern);
                             if ((((uri2.Scheme != Uri.UriSchemeHttp) && (uri2.Scheme != Uri.UriSchemeHttps)) || ((uri2.Host.Split(new char[] { '.' })[1] != this.urllhost[1]) && this.KeepSameServer)) || !this.Compared_jpg(uri2.LocalPath.Substring(uri2.LocalPath.LastIndexOf('.') + 1).ToLower()))
                             {
                                 continue;
                             }
                             Global.URL = uri2.ToString();
                             if ((Global.BXBH != "") && (Redspider_link.bxbh() == 2))
                             {
                                 continue;
                             }
                             uri2.Depth = uri.Depth + 1;
                             if (this.EnqueueUri(uri2, true))
                             {
                                 str = str + uri2.AbsoluteUri + "\r\n";
                             }
                         }
                         catch (Exception)
                         {
                         }
                     }
                 }
             }
         }
     }
     catch (Exception exception)
     {
         this.LogError(uri.AbsoluteUri, str + exception.Message);
         request = null;
     }
     finally
     {
         this.EraseItem(item);
     }
 }
Exemplo n.º 27
0
        /// <summary>
        /// Asks a freshly created <c>MysqlHelper</c> whether the given URI has been visited.
        /// </summary>
        /// <param name="uri">The URI to look up.</param>
        /// <returns>Whatever <c>MysqlHelper.IsVisited</c> reports for <paramref name="uri"/>.</returns>
        public static bool IsVisited(MyUri uri)
        {
            // A new helper is created on every call; no instance is cached here.
            return new MysqlHelper().IsVisited(uri);
        }
Exemplo n.º 28
0
    /// <summary>
    /// Parses <paramref name="uri"/> with <c>MyUri</c> and copies its parts into a new <c>UriCi</c>.
    /// </summary>
    /// <param name="uri">The URI text handed to the <c>MyUri</c> constructor.</param>
    /// <returns>
    /// A <c>UriCi</c> whose <c>url</c>, <c>ip</c> and <c>port</c> mirror the parsed values and whose
    /// <c>get</c> dictionary holds every key/value pair enumerated from <c>MyUri.Get</c>.
    /// </returns>
    public override UriCi ParseUri(string uri)
    {
        MyUri parsed = new MyUri(uri);

        // Members are assigned in the same order as a statement-by-statement copy would.
        UriCi converted = new UriCi
        {
            url = parsed.Url,
            ip = parsed.Ip,
            port = parsed.Port,
            get = new DictionaryStringString()
        };

        // Transfer each entry exposed by the parsed URI's Get collection.
        foreach (var entry in parsed.Get)
        {
            converted.get.Set(entry.Key, entry.Value);
        }

        return converted;
    }