/// <summary>
/// Processes one news link: downloads the page directly and again through the
/// getnewscontent.aspx endpoint, extracts the article fields from both responses,
/// then stores the article and the visited record.
/// </summary>
/// <param name="oldUri">The original news article address.</param>
/// <returns>
/// 0 when neither download returned the timeout message;
/// 1 when only the direct (non-kd) download returned "操作超时";
/// 2 when the kd-URL download returned "操作超时" (this wins when both timed out).
/// </returns>
internal static int ParseNewsUri(MyUri oldUri)
{
    const string TimeoutMessage = "操作超时";

    // NOTE(review): the endpoint password is embedded in source, and the article URL is
    // appended without escaping, so an address containing '&' would split the query
    // string. Confirm how the endpoint parses "url" before changing this.
    string constructUri = string.Format(@"http://59.39.71.239:886/getnewscontent.aspx?password=Kcis123_AutoGetNewsContent&url={0}", oldUri.AbsoluteUri);
    MyUri newUri = new MyUri(constructUri);

    WebHelper webHelper = new WebHelper();
    string oldResponseStr = webHelper.GetContent(oldUri);
    string newResponseStr = webHelper.GetContent(newUri);

    // GetContent may hand back null, so compare with the static, null-safe string.Equals
    // instead of calling Equals on the response itself.
    int result = 0;
    if (string.Equals(oldResponseStr, TimeoutMessage))
    {
        result = 1;
    }
    if (string.Equals(newResponseStr, TimeoutMessage))
    {
        result = 2;
    }

    GetNewsInfo(oldResponseStr, newResponseStr, oldUri);
    InsertNews();
    InsertVisited();
    return result;
}
/// <summary>
/// Dispatches forum (section) extraction according to which portal the link belongs to.
/// </summary>
/// <param name="responseStr">Page content handed on to GetNewsForum.</param>
/// <param name="uri">Address of the article; its text decides the portal code.</param>
/// <returns>
/// The value returned by GetNewsForum for the first matching portal, or null when the
/// address matches none of the known portals.
/// </returns>
private static string GetNewsForumTrans(string responseStr, MyUri uri)
{
    // Checks run in a fixed order; the first portal fragment found in the address wins.
    if (uri.AbsoluteUri.Contains("sina.com"))
    {
        return GetNewsForum(responseStr, 1);
    }

    if (uri.AbsoluteUri.Contains("qq.com"))
    {
        return GetNewsForum(responseStr, 2);
    }

    if (uri.AbsoluteUri.Contains("sohu.com"))
    {
        return GetNewsForum(responseStr, 3);
    }

    if (uri.AbsoluteUri.Contains("163.com"))
    {
        return GetNewsForum(responseStr, 4);
    }

    if (uri.AbsoluteUri.Contains("ifeng.com"))
    {
        return GetNewsForum(responseStr, 5);
    }

    return null;
}
/// <summary>
/// Decides whether a link is worth crawling, which includes checking whether the news
/// record already exists.
/// </summary>
/// <param name="newUri">Candidate link.</param>
/// <returns>
/// true when the news record does not exist yet, the address ends with one of the accepted
/// page suffixes and contains none of the excluded fragments; otherwise false. When the
/// record already exists, UpdateUri is called for it and false is returned.
/// </returns>
private bool Isvalueable(MyUri newUri)
{
    MysqlHelper mysqlHelper = new MysqlHelper();
    if (mysqlHelper.IsNewsExist(newUri.AbsoluteUri))
    {
        UpdateUri(newUri);
        return false;
    }

    // Any of these fragments disqualifies the link regardless of its suffix.
    string[] excludedFragments =
    {
        @"163.com/special",
        @"http://comment.ifeng.com/",
        @"http://comment2.news.sohu.com",
        @"http://news.ifeng.com/photo",
        @"http://slide.mil.news.sina.com.cn",
        @"slide.news.sina",
        "video.sina"
    };
    foreach (string fragment in excludedFragments)
    {
        if (newUri.AbsoluteUri.Contains(fragment))
        {
            return false;
        }
    }

    string[] acceptedSuffixes = { ".shtml", "html", "htm" };
    foreach (string suffix in acceptedSuffixes)
    {
        if (newUri.AbsoluteUri.EndsWith(suffix))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Reads the given URL and returns its content.
/// </summary>
/// <param name="uri">Address to download.</param>
/// <returns>
/// The result of getHtml2 for the address "http://comment.news.sohu.com/djpm/",
/// and the result of getHtml1 for every other address.
/// </returns>
public string GetContent(MyUri uri)
{
    bool useSecondDownloader = uri.AbsoluteUri == "http://comment.news.sohu.com/djpm/";
    return useSecondDownloader ? getHtml2(uri) : getHtml1(uri);
}
/// <summary>
/// Builds a MyUri for a reference found on a page, one level deeper than the page itself.
/// </summary>
/// <param name="strRef">The reference text taken from the page.</param>
/// <param name="uri">The page the reference was found on; used as the base for resolution.</param>
/// <returns>A new MyUri whose Depth is <c>uri.Depth + 1</c>.</returns>
private MyUri GetNewUri(string strRef, MyUri uri)
{
    // Used as-is only when it starts with "http://" and has neither ".." nor a leading "/".
    bool usableAsIs = strRef.IndexOf("..") == -1
        && !strRef.StartsWith("/")
        && strRef.StartsWith("http://");

    string target = usableAsIs ? strRef : new Uri(uri, strRef).AbsoluteUri;

    MyUri resolved = new MyUri(target);
    resolved.Depth = uri.Depth + 1;
    return resolved;
}
/// <summary>
/// Downloads a page that is served compressed, decodes it with the encoding named by
/// <c>uri.Encoding</c>, and returns the part of the page matched by the
/// <c>&lt;style&gt;.*&lt;style&gt;</c> pattern.
/// </summary>
/// <param name="uri">Address to download; <c>uri.Encoding</c> names the text encoding.</param>
/// <returns>
/// The matched fragment (empty when the pattern does not match). When every attempt to
/// obtain a response fails, the message of the last exception is returned instead,
/// or null if no attempt produced a message.
/// </returns>
private static string getHtml2(MyUri uri)
{
    // The original loop only counted successful requests, so a permanently failing
    // address never left the loop. Attempts are now bounded explicitly.
    const int MaxAttempts = 7;

    string read = null;
    HttpWebResponse response = null;

    for (int attempt = 0; attempt < MaxAttempts && response == null; attempt++)
    {
        try
        {
            HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(uri.AbsoluteUri);
            Thread.Sleep(30);

            // Let the framework negotiate and undo gzip/deflate, so a response that is
            // not gzip-encoded no longer breaks a hand-rolled GZipStream.
            wr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            response = (HttpWebResponse)wr.GetResponse();
        }
        catch (Exception ex)
        {
            log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
            log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, ex.Message));
            read = ex.Message;
        }
    }

    if (response == null)
    {
        // Nothing to read; hand back the last error text instead of dereferencing null.
        return read;
    }

    // The original read the body only when an earlier attempt had failed (read != null),
    // so a first-try success returned null. The body is now always read, and the
    // response and its stream are released afterwards.
    using (response)
    using (System.IO.Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(uri.Encoding)))
    {
        read = reader.ReadToEnd();
    }

    // NOTE(review): the pattern ends with an opening <style> tag, not </style>;
    // kept unchanged — confirm against the target page.
    read = Regex.Match(read, "<style>.*<style>", RegexOptions.Singleline).Value;
    return read;
}
/// <summary>
/// Takes the next news link off the news queue.
/// </summary>
/// <returns>The dequeued link, or null when the queue is empty.</returns>
private MyUri DequeueNewsUri()
{
    // lock releases the monitor in a finally block, so the queue cannot stay locked
    // if this thread is aborted or an exception escapes while it holds the lock.
    lock (_UriNewsQueue)
    {
        if (_UriNewsQueue.Count == 0)
        {
            return null;
        }

        MyUri newsUri = _UriNewsQueue.Dequeue();
        _lastDequeueNewsTime = DateTime.Now;
        return newsUri;
    }
}
/// <summary>
/// Asserts that each fixture on the right-hand side (One, Two, TwoL, SomeText, SomeUri
/// and a freshly created AnyT pair) reports the same hash code as the plain value it is
/// compared against.
/// </summary>
public static void GetHashCode_Some()
{
    // Integer and long fixtures.
    Assert.Equal(1.GetHashCode(), One.GetHashCode());
    Assert.Equal(2.GetHashCode(), Two.GetHashCode());
    Assert.Equal(2L.GetHashCode(), TwoL.GetHashCode());

    // The string check is compiled only where GetHashCode(StringComparison) is available.
#if !(NETSTANDARD2_0 || NETSTANDARD1_x || NETFRAMEWORK) // GetHashCode(StringComparison)
    Assert.Equal(MyText.GetHashCode(StringComparison.Ordinal), SomeText.GetHashCode());
#endif

    Assert.Equal(MyUri.GetHashCode(), SomeUri.GetHashCode());

    // Pair produced by AnyT.New(): its Value and its Some member must hash alike.
    var sample = AnyT.New();
    Assert.Equal(sample.Value.GetHashCode(), sample.Some.GetHashCode());
}
/// <summary>
/// Body of a news-processing worker thread: keeps pulling news links and parsing them,
/// sleeping one second whenever the queue is empty, until isFinished() reports true.
/// Afterwards it may write the collection summary to the log.
/// </summary>
/// <param name="obj">Thread start argument; not used by this method.</param>
private void ThreadFunction_news(object obj)
{
    for (;;)
    {
        MyUri newsUri = DequeueNewsUri();

        if (newsUri == null)
        {
            if (isFinished())
            {
                AbordNewsThreads();
                break;
            }

            Thread.Sleep(1 * 1000);
            continue;
        }

        // NOTE(review): this instance is never used afterwards; it is kept because the
        // constructor's side effects are not visible here.
        NewsHelper newsHelper = new NewsHelper();
        int result = NewsHelper.ParseNewsUri(newsUri);

        _allKdUrls += 1;
        switch (result)
        {
            case 1:
                _timeOutNormalUrl += 1;
                break;
            case 2:
                _timeOutKdUrl += 1;
                break;
        }
    }

    // Counts news threads whose state is exactly Aborted or exactly Stopped.
    int finishedThreads = 0;
    foreach (var thread in _threadsNews)
    {
        bool aborted = thread.ThreadState == ThreadState.Aborted;
        if (aborted || thread.ThreadState == ThreadState.Stopped)
        {
            finishedThreads++;
        }
    }

    if (finishedThreads > 1)
    {
        return;
    }

    // The summary is written only while at most one thread is counted above.
    log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
    log.Debug(string.Format("结束采集{0}", DateTime.Now));
    // NOTE(review): the total is logged twice, exactly as before.
    log.Debug(string.Format("全部链接数{0}", _allKdUrls));
    log.Debug(string.Format("全部链接数{0}", _allKdUrls));
    log.Debug(string.Format("kd超时链接数{0}", _timeOutKdUrl));
    log.Debug(string.Format("门户超时链接数{0}", _timeOutNormalUrl));
}
/// <summary>
/// Takes the next link off the seed queue.
/// </summary>
/// <returns>
/// The dequeued seed link, or null when the queue is empty or the dequeued item is not
/// a MyUri.
/// </returns>
private MyUri DequeueSeedUri()
{
    // lock guarantees the monitor is released even if the thread is aborted while
    // holding it; the explicit Count check replaces catching the empty-queue exception.
    lock (_UriSeedQueue)
    {
        if (_UriSeedQueue.Count == 0)
        {
            return null;
        }

        // 'as' keeps the previous outcome for a non-MyUri item: null instead of a throw.
        return _UriSeedQueue.Dequeue() as MyUri;
    }
}
/// <summary>
/// Loads every row of the info_seed table and turns each row that has a non-empty
/// seed_url into a seed MyUri.
/// </summary>
/// <returns>
/// The seeds that were read; an empty list when the query fails (the failure is logged).
/// </returns>
internal List<MyUri> SelectAllSeeds()
{
    List<MyUri> seeds = new List<MyUri>();
    const string selectSeedsSql = @"select * from info_seed ";
    DataSet seedData = new DataSet();

    // using releases the connection and adapter on every path.
    using (MySqlConnection conn = new MySqlConnection(connStr))
    {
        try
        {
            conn.Open();
            using (MySqlDataAdapter adapter = new MySqlDataAdapter(selectSeedsSql, conn))
            {
                // Run the query and load the rows into the "result_data" table.
                adapter.Fill(seedData, "result_data");
            }
        }
        catch (Exception e)
        {
            log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
            log.Debug(e.Message);
        }
    }

    // When opening or filling failed before the table was created there is nothing to
    // read; previously this case dereferenced a missing table.
    DataTable seedTable = seedData.Tables["result_data"];
    if (seedTable == null)
    {
        return seeds;
    }

    foreach (DataRow row in seedTable.Rows)
    {
        string seedUrl = row["seed_url"].ToString();
        int id = int.Parse(row["ID"].ToString());
        if (seedUrl == "")
        {
            continue;
        }

        MyUri seed = new MyUri(seedUrl);
        seed.Encoding = row["seed_encoding"].ToString();
        seed.Id = id;
        seed.Name = row["seed_name"].ToString();
        seeds.Add(seed);
    }

    return seeds;
}
/// <summary>
/// Dispatches publication-time extraction according to which portal the link belongs to.
/// </summary>
/// <param name="responseStr">Page content handed on to GetNewsTime.</param>
/// <param name="uri">Address of the article; its text decides the portal group.</param>
/// <returns>
/// The value returned by GetNewsTime for the matching portal group, or null when the
/// address matches none of the known portals.
/// </returns>
private static string GetNewsTimeTrans(string responseStr, MyUri uri)
{
    // Group 1: 163.com and qq.com.
    if (uri.AbsoluteUri.Contains("163.com") || uri.AbsoluteUri.Contains("qq.com"))
    {
        return GetNewsTime(responseStr, 1);
    }

    // Group 2: sina.com.
    if (uri.AbsoluteUri.Contains("sina.com"))
    {
        return GetNewsTime(responseStr, 2);
    }

    // Group 3: sohu.com and ifeng.com.
    if (uri.AbsoluteUri.Contains("sohu.com") || uri.AbsoluteUri.Contains("ifeng.com"))
    {
        return GetNewsTime(responseStr, 3);
    }

    return null;
}
/// <summary>
/// Downloads a seed page, extracts the news links found on it and puts them on the
/// news queue. Progress is written to the console and to the log.
/// </summary>
/// <param name="seedUri">The seed page to parse.</param>
private void ParseSeedUri(MyUri seedUri)
{
    string contentStr = new WebHelper().GetContent(seedUri);
    log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");

    if (contentStr == null)
    {
        // Nothing could be downloaded for this seed; report and stop.
        Console.WriteLine("获取不到种子列表信息");
        log.Debug(string.Format("{0}--->{1}", "获取不到种子列表信息", seedUri.AbsoluteUri));
        return;
    }

    List<MyUri> newsUris = GetNewsUri(contentStr, seedUri);

    // The timestamp is taken separately for the console line and for the log line.
    Console.WriteLine(seedUri.AbsoluteUri + "--->" + newsUris.Count + "--->" + DateTime.Now);
    log.Debug(seedUri.AbsoluteUri + "--->" + newsUris.Count + "--->" + DateTime.Now);

    EnqueueNewsUris(newsUris);
}
/// <summary>
/// Determines whether a link has already been visited by looking it up in the
/// info_visited table.
/// </summary>
/// <param name="uri">The link to look up (matched on visited_url).</param>
/// <returns>
/// true when at least one matching row exists; false when none exists or when the
/// lookup fails (the failure is logged).
/// </returns>
internal bool IsVisited(MyUri uri)
{
    // The URL comes from scraped pages, so it is passed as a parameter rather than
    // spliced into the SQL text (a quote in the address used to break the query).
    const string selectVisitedSql = @"select * from info_visited where visited_url = @url";

    try
    {
        using (MySqlConnection conn = new MySqlConnection(connStr))
        using (MySqlDataAdapter adapter = new MySqlDataAdapter(selectVisitedSql, conn))
        {
            adapter.SelectCommand.Parameters.AddWithValue("@url", uri.AbsoluteUri);

            // Run the query and load the rows into the "result_data" table.
            DataSet visitedData = new DataSet();
            adapter.Fill(visitedData, "result_data");

            return visitedData.Tables["result_data"].Rows.Count != 0;
        }
    }
    catch (Exception e)
    {
        log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
        log.Debug(e.Message);
    }

    return false;
}
/// <summary>
/// Body of a seed-processing worker thread: keeps pulling seed links and parsing them
/// until IsSeedHandled() reports true while the queue is empty.
/// </summary>
/// <param name="obj">Thread start argument; not used by this method.</param>
private void ThreadFunction_seed(object obj)
{
    for (;;)
    {
        MyUri seed = DequeueSeedUri();

        if (seed != null)
        {
            // One-second pause before each seed is parsed.
            Thread.Sleep(1 * 1000);
            ParseSeedUri(seed);
            continue;
        }

        if (IsSeedHandled())
        {
            AbordSeedThreads();
            return;
        }

        // Queue is empty but seeds are not all handled yet: wait and poll again.
        Thread.Sleep(1 * 1000);
    }
}
/// <summary>
/// Extracts the news links from a downloaded seed page.
/// </summary>
/// <param name="contentStr">The page content.</param>
/// <param name="uri">The seed page the content came from; its Name, Encoding and Id are
/// copied onto every link that is kept.</param>
/// <returns>
/// The links that are neither already visited nor rejected by Isvalueable. References
/// that cannot be turned into a valid URI are logged and skipped.
/// </returns>
private List<MyUri> GetNewsUri(string contentStr, MyUri uri)
{
    // Script blocks are removed first so references inside them are not picked up.
    string content = Regex.Replace(contentStr, @"<script(\s[^>]*?)?>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
    const string refPattern = @"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']";

    List<MyUri> newsUris = new List<MyUri>();
    foreach (Match match in new Regex(refPattern).Matches(content))
    {
        string strRef = GetRef(match);

        MyUri newUri;
        try
        {
            newUri = GetNewUri(strRef, uri);
        }
        catch (UriFormatException ex)
        {
            // A single malformed reference must not discard every other link on the page.
            log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
            log.Debug(string.Format("错误信息{0}--->{1}", strRef, ex.Message));
            continue;
        }

        newUri.IsVisited = IsVisited(newUri);
        if (!newUri.IsVisited && Isvalueable(newUri))
        {
            newUri.Name = uri.Name;
            newUri.Encoding = uri.Encoding;
            newUri.Fk_seed = uri.Id;
            newsUris.Add(newUri);
        }
    }

    return newsUris;
}
/// <summary>
/// Updates the insert time of the news record for the given link by delegating to
/// MysqlHelper.UpdateInsertTime.
/// </summary>
/// <param name="newUri">The link whose record is updated.</param>
private void UpdateUri(MyUri newUri)
{
    new MysqlHelper().UpdateInsertTime(newUri.AbsoluteUri);
}
/// <summary>
/// Walks a folder tree and enqueues every *.txt file found in it.
/// </summary>
/// <param name="folderName">Folder to scan.</param>
/// <param name="nDepth">Depth assigned to files directly in this folder; each level of
/// sub-folder adds one.</param>
private void ParseFolder(string folderName, int nDepth)
{
    DirectoryInfo folder = new DirectoryInfo(folderName);

    foreach (FileInfo textFile in folder.GetFiles("*.txt"))
    {
        // Stop as soon as the crawler has been asked to stop.
        if (!this.ThreadsRunning)
        {
            break;
        }

        MyUri fileUri = new MyUri(textFile.FullName);
        fileUri.Depth = nDepth;
        this.EnqueueUri(fileUri, true);
    }

    foreach (DirectoryInfo subFolder in folder.GetDirectories())
    {
        if (!this.ThreadsRunning)
        {
            break;
        }

        this.ParseFolder(subFolder.FullName, nDepth + 1);
    }
}
/// <summary>
/// Downloads a page and returns its text. For a few known ranking pages only the part
/// of the page matched by a page-specific pattern is returned.
/// </summary>
/// <param name="uri">Address to download; <c>uri.Encoding</c> names the text encoding,
/// except for addresses containing "http://59.39.71.239:886/", which are read as UTF-8.</param>
/// <returns>
/// The page text (or matched fragment). When the request fails, the exception message
/// is returned instead. null when the response was obtained but reading it failed.
/// </returns>
private static string getHtml1(MyUri uri)
{
    string tempCode = null;
    HttpWebResponse response = null;

    // A single attempt is made: the original retry loop never repeated because its
    // error flag was never set, and that behaviour is kept.
    HttpWebRequest request = HttpWebRequest.Create(uri.AbsoluteUri) as HttpWebRequest;
    Thread.Sleep(30);
    request.AllowAutoRedirect = true;

    try
    {
        response = request.GetResponse() as HttpWebResponse;
    }
    catch (Exception ex)
    {
        log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
        log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, ex.Message));
        // Callers compare the returned text with the timeout message.
        tempCode = ex.Message;
    }

    if (response == null)
    {
        // Return the error text directly instead of tripping over a null response
        // and logging a second, misleading error.
        return tempCode;
    }

    try
    {
        // using releases the response, stream and reader even when reading fails.
        using (response)
        using (System.IO.Stream resStream = response.GetResponseStream())
        {
            Encoding encoding = uri.AbsoluteUri.Contains("http://59.39.71.239:886/")
                ? Encoding.UTF8
                : Encoding.GetEncoding(uri.Encoding);

            using (StreamReader sr = new StreamReader(resStream, encoding))
            {
                tempCode = sr.ReadToEnd();
            }
        }

        // Ranking pages: keep only the section that holds the ranked links.
        string pattern = null;
        switch (uri.AbsoluteUri)
        {
            case "http://news.163.com/rank/":
                pattern = @"<h2>全站</h2>.*<h2>科技</h2>";
                break;
            case "http://news.qq.com/paihang.htm":
                pattern = "<tbody>.*<tbody>";
                break;
            case "http://news.ifeng.com/hotnews/":
                pattern = "shtml.*<h4>资讯</h4>";
                break;
            case "http://news.sina.com.cn/hotnews/":
                pattern = "<!--seo内容输出开始-->.*<!--seo内容输出结束-->";
                break;
        }

        if (pattern != null)
        {
            tempCode = Regex.Match(tempCode, pattern, RegexOptions.Singleline).Value;
        }
    }
    catch (Exception e)
    {
        log4net.ILog log = log4net.LogManager.GetLogger("MyLogger");
        log.Debug(string.Format("错误信息{0}--->{1}", uri.AbsoluteUri, e.Message));
    }

    return tempCode;
}
/// <summary>
/// Registers a link in the URL storage unless its host matches an excluded pattern.
/// </summary>
/// <param name="uri">The link to register. It is passed by reference but not reassigned here.</param>
/// <returns>
/// false when the host contains one of the ExcludeHosts patterns (an error is logged),
/// or when adding to the storage throws; otherwise whether the value returned by
/// <c>urlStorage.Add</c> has a Count of exactly 1.
/// </returns>
private bool AddURL(ref MyUri uri)
{
    foreach (string str in this.ExcludeHosts)
    {
        string pattern = str.Trim();
        if ((pattern.Length > 0) && (uri.Host.ToLower().IndexOf(pattern) != -1))
        {
            this.LogError(uri.AbsoluteUri, "\r\nHost excluded as it includes reserved pattern (" + str + ")");
            return false;
        }
    }

    bool flag = false;

    // lock releases the monitor in a finally block, so the storage cannot stay locked
    // if this thread is aborted while it holds the lock.
    lock (this.urlStorage)
    {
        try
        {
            string absoluteUri = uri.AbsoluteUri;
            flag = this.urlStorage.Add(ref absoluteUri).Count == 1;
        }
        catch (Exception)
        {
            // Best effort, as before: a failed add leaves flag false.
        }
    }

    return flag;
}
/// <summary>
/// Fills the shared <c>_news</c> record with the information for one article.
/// </summary>
/// <param name="oldResponseStr">Content of the original page; used for forum and time.</param>
/// <param name="newResponseStr">Content of the second download; used for content and title.</param>
/// <param name="uri">The article link; supplies Name, Fk_seed and the address.</param>
private static void GetNewsInfo(string oldResponseStr, string newResponseStr, MyUri uri)
{
    _news.Source = uri.Name;

    _news.Fk_seed_news = uri.Fk_seed;
    bool seedKeyMissing = _news.Fk_seed_news == null || _news.Fk_seed_news == 0;
    if (seedKeyMissing)
    {
        Console.WriteLine("{0}的外键---〉{1}", uri.AbsoluteUri, uri.Fk_seed);
    }

    _news.Url = uri.AbsoluteUri;
    bool urlMissing = _news.Url == null || _news.Url == "";
    if (urlMissing)
    {
        Console.WriteLine("{0}的链接", uri.AbsoluteUri);
    }

    // Content fields are written only when both downloads produced some text.
    // NOTE(review): otherwise Content/Title/Forum/Time are left untouched here —
    // confirm they cannot carry over from a previous article.
    if (string.IsNullOrEmpty(newResponseStr) || string.IsNullOrEmpty(oldResponseStr))
    {
        return;
    }

    _news.Content = GetNewsContent(newResponseStr);
    _news.Title = GetNewsTitle(newResponseStr);
    _news.Forum = GetNewsForumTrans(oldResponseStr, uri);
    _news.Time = GetNewsTimeTrans(oldResponseStr, uri);
}
/// <summary>
/// Starts a crawl for the column selected in <c>Global.COLUMN_ID</c>: reads the matching
/// Web_URL rows, copies their fields into the Global settings, and enqueues the starting
/// point of each row (a local folder is walked, anything else is enqueued as a URI).
/// </summary>
/// <remarks>
/// Any exception is reported through LogError and ends the method before the
/// "continue" toolbar button is disabled.
/// </remarks>
private void RunParser()
{
    this.ThreadsRunning = true;
    try
    {
        // num is parsed as a long, so appending it to the SQL text cannot inject SQL.
        long num = long.Parse(Global.COLUMN_ID.ToString());

        // using releases the connection, command and reader on every path; previously
        // none of them was closed.
        using (OleDbConnection connection = new OleDbConnection(CONN_ACCESS.ConnString))
        using (OleDbCommand command = new OleDbCommand("SELECT * FROM Web_URL where id=" + num, connection))
        {
            connection.Open();
            using (OleDbDataReader reader = command.ExecuteReader(CommandBehavior.CloseConnection))
            {
                while (reader.Read())
                {
                    string path = reader["web_url"].ToString().Replace("\r", "");
                    Global.BM = reader["bm"].ToString();
                    Global.BXBH = reader["bxbh"].ToString();
                    Global.BDBH = reader["bdbh"].ToString();
                    Global.WEB_COLUMN_ID = reader["id"].ToString();
                    Global.WEB_COLUMN_NAME = reader["column_name"].ToString();
                    this.page_py = reader["column_name"].ToString();
                    Global.WEB_PARENT_ID = reader["parentid"].ToString();
                    Global.WEB_CODE = reader["code"].ToString();
                    Global.URL_BIANHAO = reader["id"].ToString();
                    this.zl_yes = reader["class"].ToString();
                    Global.ONEURL = path;

                    // Dot-separated parts of the start address, stored for later use.
                    this.urllhost = path.ToString().Split(new char[] { '.' });

                    if (Directory.Exists(path))
                    {
                        this.ParseFolder(path, 0);
                    }
                    else
                    {
                        if (!System.IO.File.Exists(path))
                        {
                            // Neither a folder nor a file: normalize the text and show it.
                            this.Normalize(ref path);
                            this.comboBoxWeb.Text = path;
                        }

                        MyUri uri = new MyUri(path);
                        this.EnqueueUri(uri, false);
                    }
                }
            }
        }
    }
    catch (Exception exception)
    {
        this.LogError(exception.Message, exception.Message);
        return;
    }

    this.toolBarButtonContinue.Enabled = false;
}
/// <summary>
/// Reports whether the link has already been visited, as answered by
/// <c>VisitedHelper.IsVisited</c>.
/// </summary>
/// <param name="newUri">The link to check.</param>
/// <returns>The value returned by <c>VisitedHelper.IsVisited</c>.</returns>
private bool IsVisited(MyUri newUri)
{
    bool alreadyVisited = VisitedHelper.IsVisited(newUri);
    return alreadyVisited;
}
/// <summary>
/// Forwards the given address text to the static <c>MyUri.ShowUri</c> method.
/// </summary>
/// <param name="uri">The address text to pass on.</param>
public void ShowUri(string uri)
{
    MyUri.ShowUri(uri);
}
/// <summary>
/// Puts a link on the URL queue, optionally registering it through AddURL first.
/// </summary>
/// <param name="uri">The link to enqueue.</param>
/// <param name="bCheckRepetition">When true, the link is passed to AddURL first and is
/// enqueued only if AddURL returns true.</param>
/// <returns>
/// false when the repetition check was requested and AddURL rejected the link;
/// true otherwise.
/// </returns>
private bool EnqueueUri(MyUri uri, bool bCheckRepetition)
{
    if (bCheckRepetition && !this.AddURL(ref uri))
    {
        return false;
    }

    // lock releases the monitor in a finally block, so the queue cannot stay locked
    // if this thread is aborted while it holds the lock.
    lock (this.queueURLS)
    {
        try
        {
            this.queueURLS.Enqueue(uri);
        }
        catch (Exception)
        {
            // Best effort, as before: a failed enqueue is ignored and true is still returned.
        }
    }

    return true;
}
/// <summary>
/// Processes one queued link on a crawler thread: requests it, filters it, indexes its
/// analysed content, downloads the body and enqueues the references found in it.
/// </summary>
/// <remarks>
/// Steps, in the order the code performs them:
/// 1. Under a lock on <c>listViewThreads</c>, the list row whose index is the current
///    thread's name is updated with the depth, the status "正在连接" and the address;
///    any exception while doing so is ignored.
/// 2. The request is created with <c>MyWebRequest.Create(uri, request, KeepAlive)</c> and its
///    Timeout is set to <c>RequestTimeout * 1000</c> (presumably seconds to milliseconds).
/// 3. If the response address differs from <paramref name="uri"/>, the response address is
///    enqueued (with the repetition check), <paramref name="request"/> is set to null and
///    nothing else is done for this link.
/// 4. Otherwise, unless <c>AllMIMETypes</c> is set, the content type (lower-cased, without
///    any ";" suffix) is looked up in <c>MIMETypes</c>. An unlisted type is logged and the
///    method returns. The two numbers that follow the match in <c>MIMETypes</c> are read as
///    lower and upper size limits multiplied by 1024; when lower &lt; upper and
///    ContentLength lies outside them, the link is logged and the method returns.
/// 5. A flag is cleared for addresses ending in .gif, .jpg, .css, .zip, .exe or in any
///    entry of <c>ExcludeFiles</c>; it later decides whether the body text is kept and parsed.
/// 6. If <c>Compared</c> accepts the file extension and the address does not end with "/",
///    the link is logged as "丢弃--非网页文件" and skipped.
/// 7. Otherwise, for a well-formed absolute address, the page returned by <c>GetPage</c> is
///    analysed with <c>CommonAnalyze</c>. An empty document is logged as discarded. A
///    non-empty one is added to the Lucene index through <c>writer</c>; once <c>cgcount</c>
///    has reached 10, the index is first searched for the document's MD5 and a match is
///    logged as a duplicate instead of being added.
/// 8. The body is then read from <c>response.socket</c> in blocks of 0x2800 bytes while the
///    list row shows the byte count and percentage. The text is accumulated (decoded as
///    ASCII) only when the flag from step 5 is still set.
/// 9. <c>FileCount</c> and <c>ByteCount</c> are updated. When <c>ThreadsRunning</c>, the flag
///    and <c>uri.Depth &lt; WebDepth</c> all hold, every href/src reference in the text is
///    resolved, normalized and enqueued one level deeper. A reference is skipped when it is
///    not http/https, when <c>KeepSameServer</c> is set and the second dot-separated part of
///    its host differs from <c>urllhost[1]</c>, when <c>Compared_jpg</c> rejects its
///    extension, or when <c>Global.BXBH</c> is non-empty and <c>Redspider_link.bxbh()</c>
///    returns 2. Exceptions for a single reference are ignored.
/// Any other exception is reported through <c>LogError</c> and sets
/// <paramref name="request"/> to null. <c>EraseItem</c> is always called for the list row
/// at the end.
/// NOTE(review): <c>result.Doc.Substring(20)</c> runs before the null/empty check on
/// <c>result.Doc</c>, and <c>Search(query).Doc(0)</c> assumes at least one hit — both rely
/// on the outer catch; confirm this is intended.
/// </remarks>
/// <param name="uri">The link to process.</param>
/// <param name="request">The request from the previous call on this thread, or null. It is
/// replaced by the new request and reset to null on redirect, rejection or error.</param>
private void ParseUri(MyUri uri, ref MyWebRequest request) { string str = ""; if ((request != null) && request.response.KeepAlive) { str = str + "连接转至: " + uri.Host + "\r\n\r\n"; } else { str = str + "连接: " + uri.Host + "\r\n\r\n"; } ListViewItem item = null; Monitor.Enter(this.listViewThreads); try { item = this.listViewThreads.Items[int.Parse(Thread.CurrentThread.Name)]; item.SubItems[1].Text = uri.Depth.ToString(); item.ImageIndex = 1; item.BackColor = System.Drawing.Color.WhiteSmoke; item.SubItems[2].Text = "正在连接"; item.ForeColor = System.Drawing.Color.Red; item.SubItems[3].Text = uri.AbsoluteUri; item.SubItems[4].Text = ""; item.SubItems[5].Text = ""; } catch (Exception) { } Monitor.Exit(this.listViewThreads); try { object obj2; request = MyWebRequest.Create(uri, request, this.KeepAlive); request.Timeout = this.RequestTimeout * 0x3e8; MyWebResponse response = request.GetResponse(); str = str + request.Header + response.Header; if (!response.ResponseUri.Equals(uri)) { this.EnqueueUri(new MyUri(response.ResponseUri.AbsoluteUri), true); obj2 = str; str = string.Concat(new object[] { obj2, "重定向到: ", response.ResponseUri, "\r\n" }); request = null; } else { if ((!this.AllMIMETypes && (response.ContentType != null)) && (this.MIMETypes.Length > 0)) { string str2 = response.ContentType.ToLower(); int index = str2.IndexOf(';'); if (index != -1) { str2 = str2.Substring(0, index); } if ((str2.IndexOf('*') == -1) && ((index = this.MIMETypes.IndexOf(str2)) == -1)) { this.LogError(uri.AbsoluteUri, str + "\r\nUnlisted Content-Type (" + str2 + "), check settings."); request = null; return; } Match match = new Regex(@"\d+").Match(this.MIMETypes, index); int num3 = int.Parse(match.Value) * 0x400; int num4 = int.Parse(match.NextMatch().Value) * 0x400; if ((num3 < num4) && ((response.ContentLength < num3) || (response.ContentLength > num4))) { this.LogError(uri.AbsoluteUri, string.Concat(new object[] { str, "\r\nContentLength limit error (", response.ContentLength, ")" })); 
request = null; return; } } string[] strArray = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" }; bool flag = true; foreach (string str3 in strArray) { if (uri.AbsoluteUri.ToLower().EndsWith(str3)) { flag = false; break; } } foreach (string str3 in this.ExcludeFiles) { if ((str3.Trim().Length > 0) && uri.AbsoluteUri.ToLower().EndsWith(str3)) { flag = false; break; } } string strBody = uri.ToString(); if (this.Compared(uri.LocalPath.Substring(uri.LocalPath.LastIndexOf('.') + 1).ToLower()) && (uri.ToString().Substring(uri.ToString().Length - 1, 1) != "/")) { this.LogError("丢弃--非网页文件", strBody); } else { int num5; UriKind absolute = UriKind.Absolute; if (!string.IsNullOrEmpty(strBody) && Uri.IsWellFormedUriString(strBody, absolute)) { string page = GetPage(strBody); Stopwatch stopwatch = new Stopwatch(); stopwatch.Start(); Html html = new Html { Web = page, Url = strBody }; CommonAnalyze analyze = new CommonAnalyze(); analyze.LoadHtml(html); Net.LikeShow.ContentAnalyze.Document result = analyze.GetResult(); stopwatch.Stop(); string bt = result.Title.Replace("[(title)]", ""); switch (bt) { case null: case "": bt = result.Doc.Substring(20).ToString(); break; } if ((result.Doc == null) || (result.Doc == "")) { this.LogError("丢弃--空内容或非内空页", strBody); } else { Lucene.Net.Documents.Document document3; string str7 = result.Doc + bt; if (this.cgcount >= 10) { string keywords = this.MD5string(result.Doc.ToString()); string keyWordsSplitBySpace = ""; IndexSearcher searcher = new IndexSearcher(this.path); keyWordsSplitBySpace = GetKeyWordsSplitBySpace(keywords, new KTDictSegTokenizer()); Query query = new QueryParser("J_md5_bai", new KTDictSegAnalyzer(true)).Parse(keyWordsSplitBySpace); if (searcher.Search(query).Doc(0).Get("J_md5_bai") == keywords) { this.LogError("排除--重复", strBody); } else { this.cgcount++; this.LogUri(bt, "引索完成"); document3 = new Lucene.Net.Documents.Document(); document3.Add(new Field("分类", this.page_py, Field.Store.YES, Field.Index.TOKENIZED)); 
document3.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED)); document3.Add(new Field("J_msgContent_bai", str7, Field.Store.YES, Field.Index.TOKENIZED)); document3.Add(new Field("J_SiteType_bai", result.SiteType.ToString(), Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_URL_bai", strBody, Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_md5_bai", this.MD5string(result.Doc.ToString()), Field.Store.YES, Field.Index.TOKENIZED)); this.writer.AddDocument(document3); } } else { this.cgcount++; this.LogUri(bt, "引索完成"); document3 = new Lucene.Net.Documents.Document(); document3.Add(new Field("分类", this.page_py, Field.Store.YES, Field.Index.TOKENIZED)); document3.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED)); document3.Add(new Field("J_msgContent_bai", str7, Field.Store.YES, Field.Index.TOKENIZED)); document3.Add(new Field("J_SiteType_bai", result.SiteType.ToString(), Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_URL_bai", strBody, Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO)); document3.Add(new Field("J_md5_bai", this.MD5string(result.Doc.ToString()), Field.Store.YES, Field.Index.TOKENIZED)); this.writer.AddDocument(document3); } } } item.SubItems[2].Text = "正在下载"; item.ForeColor = System.Drawing.Color.Black; string input = ""; byte[] buffer = new byte[0x2800]; int nNum = 0; while ((num5 = response.socket.Receive(buffer, 0, 0x2800, SocketFlags.None)) > 0) { nNum += num5; if (flag) { input = input + Encoding.ASCII.GetString(buffer, 0, num5); } item.SubItems[4].Text = this.Commas(nNum); if (response.ContentLength > 0) { item.SubItems[5].Text = '%' + ((100 - (((response.ContentLength - nNum) * 100) / response.ContentLength))).ToString(); } if ((response.KeepAlive && (nNum >= 
response.ContentLength)) && (response.ContentLength > 0)) { break; } } if (response.KeepAlive) { str = str + "Connection kept alive to be used in subpages.\r\n"; } else { response.Close(); str = str + "Connection closed.\r\n"; } this.FileCount++; this.ByteCount += nNum; if ((this.ThreadsRunning && flag) && (uri.Depth < this.WebDepth)) { str = str + "\r\nParsing page ...\r\n"; string pattern = "(href|HREF|src|SRC)[ ]*=[ ]*[\"'][^\"'#>]+[\"']"; MatchCollection matchs = new Regex(pattern).Matches(input); obj2 = str; str = string.Concat(new object[] { obj2, "Found: ", matchs.Count, " ref(s)\r\n" }); this.URLCount += matchs.Count; foreach (Match match in matchs) { pattern = match.Value.Substring(match.Value.IndexOf('=') + 1).Trim(new char[] { '"', '\'', '#', ' ', '>' }); try { if (!(((pattern.IndexOf("..") == -1) && !pattern.StartsWith("/")) && pattern.StartsWith("http://"))) { pattern = new Uri(uri, pattern).AbsoluteUri; } this.Normalize(ref pattern); MyUri uri2 = new MyUri(pattern); if ((((uri2.Scheme != Uri.UriSchemeHttp) && (uri2.Scheme != Uri.UriSchemeHttps)) || ((uri2.Host.Split(new char[] { '.' })[1] != this.urllhost[1]) && this.KeepSameServer)) || !this.Compared_jpg(uri2.LocalPath.Substring(uri2.LocalPath.LastIndexOf('.') + 1).ToLower())) { continue; } Global.URL = uri2.ToString(); if ((Global.BXBH != "") && (Redspider_link.bxbh() == 2)) { continue; } uri2.Depth = uri.Depth + 1; if (this.EnqueueUri(uri2, true)) { str = str + uri2.AbsoluteUri + "\r\n"; } } catch (Exception) { } } } } } } catch (Exception exception) { this.LogError(uri.AbsoluteUri, str + exception.Message); request = null; } finally { this.EraseItem(item); } }
/// <summary>
/// Reports whether the link has already been visited by asking a new
/// <c>MysqlHelper</c> instance.
/// </summary>
/// <param name="uri">The link to check.</param>
/// <returns>The value returned by <c>MysqlHelper.IsVisited</c>.</returns>
public static bool IsVisited(MyUri uri)
{
    return new MysqlHelper().IsVisited(uri);
}
/// <summary>
/// Parses an address string with <c>MyUri</c> and copies the parts into a new
/// <c>UriCi</c>.
/// </summary>
/// <param name="uri">The address text to parse.</param>
/// <returns>
/// A <c>UriCi</c> whose url, ip and port come from the parsed address and whose
/// <c>get</c> dictionary holds every key/value pair of the parsed address's <c>Get</c>
/// collection.
/// </returns>
public override UriCi ParseUri(string uri)
{
    MyUri parsed = new MyUri(uri);

    UriCi ret = new UriCi
    {
        url = parsed.Url,
        ip = parsed.Ip,
        port = parsed.Port,
        get = new DictionaryStringString()
    };

    // Copy the query pairs one by one into the target dictionary.
    foreach (var pair in parsed.Get)
    {
        ret.get.Set(pair.Key, pair.Value);
    }

    return ret;
}