// Collects the post title, publication date and author name from the page HTML.
//
// HTML: full page markup to scan (passed by ref; not modified here).
// res:  record whose title / datetime / author fields are filled in. A field
//       is left untouched when its markers are missing or malformed.
//
// Each value is located between a fixed start marker and a fixed end marker.
// The end marker is always searched for AFTER the start marker, so a stray
// earlier occurrence (or a missing one) can no longer produce a negative
// Substring length and crash the run.
static void collectInformation(ref string HTML, ref resources res)
{
    // title: text between "<title>" and the site-name suffix.
    const string titleOpen = "<title>";
    const string titleClose = " | 琉璃神社 ★ HACG</title>";
    int indexOfTitle = HTML.IndexOf(titleOpen, StringComparison.Ordinal);
    if (indexOfTitle != -1)
    {
        int titleStart = indexOfTitle + titleOpen.Length;
        int titleEnd = HTML.IndexOf(titleClose, titleStart, StringComparison.Ordinal);
        if (titleEnd != -1)
        {
            res.title = HTML.Substring(titleStart, titleEnd - titleStart);
        }
    }

    // datetime: value of the datetime attribute on the "entry-date" element.
    const string dateOpen = "\"entry-date\" datetime=\"";
    const string dateClose = "\" pubdate>";
    int indexOfDatetime = HTML.IndexOf(dateOpen, StringComparison.Ordinal);
    if (indexOfDatetime != -1)
    {
        int dateStart = indexOfDatetime + dateOpen.Length;
        int dateEnd = HTML.IndexOf(dateClose, dateStart, StringComparison.Ordinal);
        if (dateEnd != -1)
        {
            string datetime = HTML.Substring(dateStart, dateEnd - dateStart);
            int year, month, day, hour, minute, second;
            // Fields are read by fixed position (yyyy?MM?dd?HH?mm?ss); the
            // separator characters are deliberately not checked.
            if (datetime.Length >= 19
                && int.TryParse(datetime.Substring(0, 4), out year)
                && int.TryParse(datetime.Substring(5, 2), out month)
                && int.TryParse(datetime.Substring(8, 2), out day)
                && int.TryParse(datetime.Substring(11, 2), out hour)
                && int.TryParse(datetime.Substring(14, 2), out minute)
                && int.TryParse(datetime.Substring(17, 2), out second))
            {
                try
                {
                    res.datetime = new DateTime(year, month, day, hour, minute, second);
                }
                catch (ArgumentOutOfRangeException)
                {
                    // Numeric but not a real calendar date/time: leave the field unset.
                }
            }
        }
    }

    // author: link text of the rel="author" anchor.
    const string authorOpen = "发布的文章\" rel=\"author\">";
    const string authorClose = "</a></span></span>";
    int indexOfAuthor = HTML.IndexOf(authorOpen, StringComparison.Ordinal);
    if (indexOfAuthor != -1)
    {
        int authorStart = indexOfAuthor + authorOpen.Length;
        int authorEnd = HTML.IndexOf(authorClose, authorStart, StringComparison.Ordinal);
        if (authorEnd != -1)
        {
            res.author = HTML.Substring(authorStart, authorEnd - authorStart);
        }
    }
}
// Appends the collected information and resources to the output files.
//
// res: the record to persist.
//
// Two files under fileLocation are appended to:
//   resourcesOfHACG.txt - header lines (index, url, title, author, datetime),
//                         then every magnet link, then every Baidu pan link
//                         (followed by " <password>" when one was found),
//                         then a blank separator line.
//   magnets.txt         - the magnet links only, one per line.
// magnets.txt is opened (and therefore created) even when there are no magnets.
//
// The streams are wrapped in using blocks so the file handles are released
// even if a write throws.
static void saveCollection(resources res)
{
    // Magnet lines are needed in both files, so build them once.
    System.Text.StringBuilder magnetLines = new System.Text.StringBuilder();
    for (int i = 0; i < res.numberOfMagnets; i++)
    {
        magnetLines.Append(res.magnets[i]).Append("\n");
    }
    string magnetText = magnetLines.ToString();

    System.Text.StringBuilder record = new System.Text.StringBuilder();
    // NOTE(review): "anthor" looks like a typo for "author", but it is part of
    // the existing on-disk format, so it is kept unchanged here.
    record.Append("index:" + res.index + "\nurl:" + res.url + "\ntitle:" + res.title + "\nanthor:" + res.author + "\ndatetime:" + res.datetime + "\n");
    record.Append(magnetText);
    for (int i = 0; i < res.numberOfBaidupanLinks; i++)
    {
        record.Append(res.baidupanLinks[i].link);
        if (res.baidupanLinks[i].havePassword)
        {
            record.Append(" " + res.baidupanLinks[i].password);
        }
        record.Append("\n");
    }
    record.Append("\n");

    byte[] data = System.Text.Encoding.Default.GetBytes(record.ToString());
    using (FileStream file = new FileStream(fileLocation + "resourcesOfHACG.txt", FileMode.Append))
    {
        file.Write(data, 0, data.Length);
    }

    data = System.Text.Encoding.Default.GetBytes(magnetText);
    using (FileStream file2 = new FileStream(fileLocation + "magnets.txt", FileMode.Append))
    {
        file2.Write(data, 0, data.Length);
    }
}
// Collects magnet links (BitTorrent info-hashes) from the page HTML.
//
// HTML: page markup to scan (passed by ref; not modified here).
// res:  record whose magnets array / numberOfMagnets counter are appended to.
//
// Three patterns are used, each requiring a delimiter character on both sides
// of the candidate:
//   - a 40-character run of all-lowercase or all-uppercase alphanumerics,
//   - a 32-character run of the same kind,
//   - a 10..30-character fragment; two consecutive fragments whose combined
//     length is 40 or 32 are joined into one hash.
// Every candidate is additionally filtered through isResourcesLink(..., "magnet").
//
// Fix: the fragment lengths are now measured WITHOUT the two delimiter
// characters each match carries. Previously the delimiters were counted, so
// the joined "hash" was only 36 or 28 characters long and never valid.
//
// NOTE(review): 32-character hashes are only looked at when a 40-character
// match or at least two fragments exist on the page; that gate is kept as-is.
static void collectMagnets(ref string HTML, ref resources res)
{
    Regex magnet40RE = new Regex("[^a-zA-Z0-9/\"\'-.;?\\[_=]([a-z0-9]{40}|[A-Z0-9]{40})[^a-zA-Z0-9/\"\'-.:;?\\[\\]_=]");
    Regex magnet32RE = new Regex("[^a-zA-Z0-9/\"\'-.;?\\[_=]([a-z0-9]{32}|[A-Z0-9]{32})[^a-zA-Z0-9/\"\'-.:;?\\[\\]_=]");
    Regex partOfMagnetRE = new Regex("[^a-zA-Z0-9/\"\'-.;?\\[_=]([A-Z0-9]{10,30}|[a-z0-9]{10,30})[^a-zA-Z0-9/\"\'-.:;?\\[\\]_=]");

    MatchCollection matches1 = magnet40RE.Matches(HTML);
    MatchCollection matches2 = magnet32RE.Matches(HTML);
    MatchCollection matches3 = partOfMagnetRE.Matches(HTML);

    if (matches1.Count == 0 && matches3.Count <= 1)
    {
        return;
    }

    // Complete 40-character hashes.
    for (int i = 0; i < matches1.Count; i++)
    {
        string candidate = matches1[i].ToString();
        if (isResourcesLink(candidate, "magnet"))
        {
            res.numberOfMagnets++;
            // Substring(1, 40) drops the leading and trailing delimiter.
            res.magnets[res.numberOfMagnets - 1] = "magnet:?xt=urn:btih:" + candidate.Substring(1, 40);
        }
    }

    // Complete 32-character hashes.
    for (int i = 0; i < matches2.Count; i++)
    {
        string candidate = matches2[i].ToString();
        if (isResourcesLink(candidate, "magnet"))
        {
            res.numberOfMagnets++;
            res.magnets[res.numberOfMagnets - 1] = "magnet:?xt=urn:btih:" + candidate.Substring(1, 32);
        }
    }

    // Hashes split into two consecutive fragments.
    for (int i = 0; i + 1 < matches3.Count;)
    {
        string first = matches3[i].ToString();
        string second = matches3[i + 1].ToString();
        // Each match is <delimiter><fragment><delimiter>; strip both ends.
        string firstPart = first.Substring(1, first.Length - 2);
        string secondPart = second.Substring(1, second.Length - 2);
        int combinedLength = firstPart.Length + secondPart.Length;

        if (combinedLength == 40 || combinedLength == 32)
        {
            string combinedMagnet = firstPart + secondPart;
            if (isResourcesLink(combinedMagnet, "magnet"))
            {
                res.numberOfMagnets++;
                res.magnets[res.numberOfMagnets - 1] = "magnet:?xt=urn:btih:" + combinedMagnet;
            }
            // Both fragments are consumed, whether or not the link was accepted.
            i = i + 2;
        }
        else
        {
            i++;
        }
    }
}
// Collects Baidu pan share links (and their extraction passwords) from the HTML.
//
// HTML: page markup to scan (passed by ref; not modified here).
// res:  record whose baidupanLinks array / numberOfBaidupanLinks counter are
//       appended to.
//
// A candidate is an 8-character alphanumeric code surrounded by delimiter
// characters, filtered through isResourcesLink(..., "baidupan"). A candidate
// whose code already appears in the most recently stored link is skipped.
// The password is looked for in two places:
//   1. the 6 characters starting at the code's trailing delimiter
//      (<non-alnum><4 lowercase alnum><non-alnum>);
//   2. otherwise, within the next 20 characters, 1 or 2 characters after the
//      first character of the text "密码".
//
// Fix: all look-ahead windows are clamped to the available text. Previously a
// code near the end of the page, or "密码" near the end of the 20-character
// window, made Substring throw ArgumentOutOfRangeException.
static void collectBaidupan(ref string HTML, ref resources res)
{
    Regex baidupanRE = new Regex("[^a-zA-Z0-9\"\'-.;:?=\\[\\]_][a-zA-Z0-9]{8}[^a-zA-Z0-9/\"\'-.;:?=\\[\\]_]");
    Regex passwordRE = new Regex("[^a-z0-9][a-z0-9]{4}[^a-z0-9]");
    // The password pattern consumes exactly this many characters.
    const int passwordWindow = 6;

    MatchCollection matches = baidupanRE.Matches(HTML);
    for (int i = 0; i < matches.Count; i++)
    {
        string matched = matches[i].ToString();
        if (!isResourcesLink(matched, "baidupan"))
        {
            continue;
        }

        string code = matched.Substring(1, 8);
        if (res.numberOfBaidupanLinks >= 1 && res.baidupanLinks[res.numberOfBaidupanLinks - 1].link.IndexOf(code) >= 0)
        {
            continue;
        }

        res.numberOfBaidupanLinks++;
        int slot = res.numberOfBaidupanLinks - 1;
        res.baidupanLinks[slot].link = "http://pan.baidu.com/s/" + code;

        // Index + 9 is the match's trailing delimiter, so it is always inside HTML.
        int tailStart = matches[i].Index + 9;
        int available = HTML.Length - tailStart;

        string password = HTML.Substring(tailStart, Math.Min(passwordWindow, available));
        if (passwordRE.IsMatch(password))
        {
            // A match on a 6-character window spans the whole window.
            res.baidupanLinks[slot].havePassword = true;
            res.baidupanLinks[slot].password = password.Substring(1, 4);
            continue;
        }

        password = HTML.Substring(tailStart, Math.Min(20, available));
        int indexOfPassword = password.IndexOf("密码");
        if (indexOfPassword < 0)
        {
            continue;
        }

        for (int j = 1; j < 3; j++)
        {
            int windowStart = indexOfPassword + j;
            if (windowStart + passwordWindow > password.Length)
            {
                // Not enough text left for a full window; larger j cannot fit either.
                break;
            }
            if (passwordRE.IsMatch(password.Substring(windowStart, passwordWindow)))
            {
                res.baidupanLinks[slot].havePassword = true;
                res.baidupanLinks[slot].password = password.Substring(windowStart + 1, 4);
            }
        }
    }
}
// Prints the collected information and resources to the console:
// title, datetime and author lines, then one line per magnet link, then one
// line per Baidu pan link (with " <password>" appended when one was found),
// and finally an empty line.
static void display(ref resources res)
{
    Console.WriteLine("title:" + res.title);
    Console.WriteLine("datetime:" + res.datetime);
    Console.WriteLine("author:" + res.author);

    int magnetCursor = 0;
    while (magnetCursor < res.numberOfMagnets)
    {
        Console.WriteLine(res.magnets[magnetCursor]);
        magnetCursor++;
    }

    for (int n = 0; n < res.numberOfBaidupanLinks; n++)
    {
        // Assemble the whole line first, then emit it in one call.
        string line = res.baidupanLinks[n].link;
        if (res.baidupanLinks[n].havePassword)
        {
            line = line + " " + res.baidupanLinks[n].password;
        }
        Console.WriteLine(line);
    }

    Console.WriteLine();
}
// Program entry point: downloads each post page, extracts its information and
// resources, prints them, saves them, and finally reports how much time was
// spent downloading versus analysing.
//
// Pages are visited either by consecutive index (START_INDEX..END_INDEX), by
// the entries of INDEXS (when INDEX_CUSTOMED), or a single fixed URL (when
// URL_CUSTOMED). A page whose download fails with a WebException is retried
// up to MAX_RETRY_TIMES times before moving on.
//
// Fixes:
//  - both stopwatches are stopped in a finally block, so a failed download no
//    longer leaves the request timer running through the failure handling;
//  - the single-URL mode jumps to the loop's actual upper bound instead of
//    END_INDEX, which is not the bound when INDEX_CUSTOMED is set.
static void Main(string[] args)
{
    string HTML = "", url = "";
    if (MANUAL_INPUT)
    {
        input();
    }

    Stopwatch HTTPRequestTimer = new Stopwatch();
    Stopwatch HTTPAnalysisTimer = new Stopwatch();
    int retryTimes = 0;
    int firstIndex = (INDEX_CUSTOMED ? INDEXS_START : START_INDEX);
    int lastIndex = (INDEX_CUSTOMED ? INDEXS_LENGTH - 1 : END_INDEX);

    for (int i = firstIndex; i <= lastIndex; i++)
    {
        int index = (INDEX_CUSTOMED ? INDEXS[i] : i);
        if (URL_CUSTOMED)
        {
            url = URL;
            // Only one page to fetch: make this the last iteration
            // (a retry decrements i, which brings it back here).
            i = lastIndex;
        }
        else
        {
            url = "http://www.hacg.li/wp/" + index + ".html";
        }

        resources res = new resources();
        res.index = index;
        res.url = url;

        if (retryTimes == 0)
        {
            Console.WriteLine("正在加载 " + url);
        }

        try
        {
            HTTPRequestTimer.Start();
            HTML = GetWebClient(url);
            HTTPRequestTimer.Stop();

            HTTPAnalysisTimer.Start();
            if (retryTimes > 0)
            {
                // Terminate the "retry N..." line printed without a newline.
                Console.WriteLine();
            }
            retryTimes = 0;
            Console.WriteLine("加载成功");

            collectInformation(ref HTML, ref res);
            collectMagnets(ref HTML, ref res);
            collectBaidupan(ref HTML, ref res);
            display(ref res);
            if (HTML.IndexOf("<pre>") >= 0)
            {
                exportCatalog(ref HTML, res.index, res.title);
            }
            saveCollection(res);
            HTTPAnalysisTimer.Stop();
        }
        catch (WebException error)
        {
            res.errorMessage = error.Message;
            Console.WriteLine("加载失败," + error.Message);
            if (retryTimes < MAX_RETRY_TIMES)
            {
                retryTimes++;
                // Stay on the same page for the next iteration.
                i--;
                Console.Write("第" + retryTimes + "次重试...");
            }
            else
            {
                retryTimes = 0;
                Console.WriteLine("已达最大重试次数,加载仍然失败\n");
            }
        }
        finally
        {
            // Stop() on an already stopped Stopwatch is a no-op, so this only
            // matters on the failure paths.
            HTTPRequestTimer.Stop();
            HTTPAnalysisTimer.Stop();
        }
    }

    Console.WriteLine("获取HTTP所花时间:" + HTTPRequestTimer.Elapsed.ToString() + " 即" + HTTPRequestTimer.ElapsedMilliseconds + "毫秒");
    Console.WriteLine("分析HTTP所花时间:" + HTTPAnalysisTimer.Elapsed.ToString() + " 即" + HTTPAnalysisTimer.ElapsedMilliseconds + "毫秒");
    Console.ReadKey();
}