/// <summary>
/// Walks the Gossiping board index pages numbered 1 through 49 on www.ptt.cc and
/// passes every anchor whose href contains "M." to <see cref="Handle"/>.
/// </summary>
/// <remarks>
/// Each index page is fetched synchronously with a 10 second timeout, no proxy and an
/// "over18=1" cookie. The raw HTML of every page is written to the debug output.
/// After the links of a page have been handled, <c>DeleteKeywords(1)</c> is invoked on a
/// fresh <c>WordBreakerDataContext</c>.
/// Network and database exceptions are not caught here and abort the crawl.
/// </remarks>
public void Start()
{
    string url_format = "https://www.ptt.cc/bbs/Gossiping/index{0}.html";

    for (int i = 1; i < 50; i++)
    {
        HttpWebRequest request = HttpWebRequest.CreateHttp(string.Format(url_format, i));
        request.Timeout = 10000; // 10 second timeout
        request.Proxy = null;
        request.Method = "GET";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.CookieContainer = new CookieContainer();
        request.CookieContainer.Add(new Cookie() { Name = "over18", Value = "1", Domain = "www.ptt.cc", Expired = false });

        // The using blocks dispose the reader and the response; no explicit Close() is needed.
        string html;
        using (WebResponse response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            html = reader.ReadToEnd();
        }

        Debug.WriteLine(html);

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes yields null (not an empty collection) when the XPath matches nothing,
        // so it must be checked before enumerating.
        var anchors = doc.DocumentNode.SelectNodes("//a");
        if (anchors != null)
        {
            foreach (var node in anchors)
            {
                var href = node.Attributes["href"];
                if (href != null && !string.IsNullOrEmpty(href.Value) && href.Value.Contains("M."))
                {
                    Handle(node);
                }
            }
        }

        // NOTE(review): runs once per index page; the meaning of the argument 1 is defined
        // by DeleteKeywords, which is not visible here.
        using (var db = new WordBreakerDataContext())
        {
            db.DeleteKeywords(1);
        }
    }
}
/// <summary>
/// Downloads the page an anchor points to, records its URL through
/// <c>InsertVisitedUrl</c>, and stores two-character keywords taken from the "push"
/// rows (and then from whatever text remains) of the element with id "main-content".
/// </summary>
/// <param name="node">
/// Anchor node whose href attribute holds a path that is appended to "https://www.ptt.cc".
/// A null node, or one without a non-empty href, is ignored.
/// </param>
/// <remarks>
/// Processing stops after the URL is recorded when the id handed back by
/// <c>InsertVisitedUrl</c> is null or 0.
/// Keywords from a push row get score 1 when the push tag is "推", -1 when it is "噓",
/// and 0 otherwise; keywords from the remaining text get score 0.
/// Network and database exceptions are not caught here.
/// </remarks>
private void Handle(HtmlNode node)
{
    // Without an href there is no page to fetch; formatting a null path would
    // silently crawl the bare site root instead.
    var hrefAttribute = (null == node) ? null : node.Attributes["href"];
    if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
    {
        return;
    }

    string url = string.Format("https://www.ptt.cc{0}", hrefAttribute.Value);

    HttpWebRequest request = HttpWebRequest.CreateHttp(url);
    request.Timeout = 10000; // 10 second timeout
    request.Proxy = null;
    request.Method = "GET";
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    request.CookieContainer = new CookieContainer();
    request.CookieContainer.Add(new Cookie() { Name = "over18", Value = "1", Domain = "www.ptt.cc", Expired = false });

    // The using blocks dispose the reader and the response; no explicit Close() is needed.
    string html;
    using (WebResponse response = request.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
    {
        html = reader.ReadToEnd();
    }

    // The URL is stored together with a hash of its UTF-8 bytes.
    string pHmac = MD5HashedBase64(Encoding.UTF8.GetBytes(url));
    long? url_id = 0;
    using (var db = new WordBreakerDataContext())
    {
        db.InsertVisitedUrl(url, pHmac, ref url_id);
    }

    // A null or zero id means there is nothing to attach keywords to
    // (presumably the URL was already visited - confirm against InsertVisitedUrl).
    if ((url_id ?? 0) == 0)
    {
        return;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var mainContent = doc.GetElementbyId("main-content");
    if (null == mainContent)
    {
        return;
    }

    List<HtmlNode> removing_nodes = new List<HtmlNode>();
    foreach (var child in mainContent.ChildNodes)
    {
        var classAttribute = child.Attributes["class"];
        if (classAttribute != null && classAttribute.Value == "push")
        {
            // The push tag appears under one of two class combinations.
            var tagNode = child.SelectSingleNode("span[@class='f1 hl push-tag']");
            if (tagNode == null)
            {
                tagNode = child.SelectSingleNode("span[@class='hl push-tag']");
            }

            string push_direct = (tagNode != null) ? tagNode.InnerText.Trim() : "";

            float positive_score = 0;
            switch (push_direct)
            {
                case "噓":
                    positive_score = -1;
                    break;
                case "推":
                    positive_score = 1;
                    break;
            }

            // SelectSingleNode yields null when the span is absent; skip such rows
            // instead of dereferencing null.
            var pushContentNode = child.SelectSingleNode("span[@class='f3 push-content']");
            if (pushContentNode != null)
            {
                InsertBigrams(CleanString(pushContentNode.InnerText.Trim()), positive_score, url_id);
            }
        }

        // Removal is deferred so the collection is not modified while it is enumerated.
        removing_nodes.Add(child);
    }

    foreach (var r in removing_nodes)
    {
        mainContent.ChildNodes.Remove(r);
    }

    // NOTE(review): every child (push or not) was queued for removal above, so
    // main-content has no children left here and InnerText is most likely empty,
    // making this pass a no-op. Confirm whether non-push nodes were meant to be kept.
    InsertBigrams(CleanString(mainContent.InnerText), 0, url_id);
}

/// <summary>
/// Stores every overlapping two-character substring of <paramref name="text"/>,
/// followed by its final single character, through <c>InsertKeyword</c>.
/// </summary>
/// <param name="text">Text to split; null or empty text stores nothing.</param>
/// <param name="score">Score passed along with every keyword.</param>
/// <param name="urlId">Id of the visited URL the keywords belong to.</param>
private void InsertBigrams(string text, float score, long? urlId)
{
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    using (var db = new WordBreakerDataContext())
    {
        for (int i = 0; i < text.Length; i++)
        {
            // The last position has no following character, so it is stored on its own.
            string keyword = ((i + 1) < text.Length) ? text.Substring(i, 2) : text.Substring(i);
            db.InsertKeyword(keyword, score, urlId);
        }
    }
}