/// <summary>
/// Downloads one page, retrying while the attempt number is below <c>Retry</c>, and parses it
/// into a <see cref="PageInfo"/> via <c>GetPageInfo</c>.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <param name="indexContent">Index entry passed through to <c>GetPageInfo</c>.</param>
/// <param name="pi">Receives the parsed page on success; left untouched on failure.</param>
/// <returns><c>true</c> when a download attempt succeeded; otherwise <c>false</c>.</returns>
protected virtual bool ProcessOnePage(string url, IndexContent indexContent, ref PageInfo pi)
{
    string html = string.Empty;
    Encoding pageEncoding = null;
    string root = FileDownloadUtil.GetRootUrl(HttpMainHost, CurrentUrl, url);

    bool downloaded = false;
    for (int attempt = 0; attempt < Retry; attempt++)
    {
        downloaded = FileDownloadUtil.DownloadWebPage(root, url, ref html, ref pageEncoding);
        if (downloaded)
        {
            break;
        }

        // Each failed attempt is logged with its zero-based attempt number.
        Logger.Instance.Error(string.Format("Download Page Failed! [{0}] = {1}", attempt, url));
    }

    if (downloaded)
    {
        pi = GetPageInfo(html, indexContent);
        return true;
    }

    Logger.Instance.Error(string.Format("Download Page Failed! [{0}]", url));
    return false;
}
/// <summary>
/// Extracts the chapter links (<c>&lt;dd&gt;&lt;a href title&gt;</c> entries) from an index page.
/// </summary>
/// <param name="htmlContent">Index page HTML; it is lower-cased before matching.</param>
/// <returns>One <see cref="IndexContent"/> per matched link, in document order.</returns>
private List<IndexContent> GetIndex(string htmlContent)
{
    Logger.Instance.Info("Enter GetIndex");

    List<IndexContent> entries = new List<IndexContent>();
    string html = htmlContent.ToLower();
    string linkPattern = @"<dd><a href=""(?<url>.*?)"" title=""(?<title>.*?)"">(?<titleContent>.*?)</a></dd>";

    MatchCollection links = Regex.Matches(html, linkPattern);
    for (int i = 0; i < links.Count; i++)
    {
        Match link = links[i];

        IndexContent entry = new IndexContent();
        entry.Url = link.Groups["url"].Value;
        entry.Title = link.Groups["title"].Value;
        entry.TitleContent = link.Groups["titleContent"].Value;
        entry.LineContent = link.Value;
        entries.Add(entry);
    }

    Logger.Instance.Info("Exit GetIndex");
    return entries;
}
/// <summary>
/// Reads the navigation variables, ids and image URLs out of a page's HTML into a new
/// <see cref="PageInfo"/>.
/// </summary>
/// <param name="htmlContent">Page HTML; it is lower-cased before matching.</param>
/// <param name="indexContent">Index entry stored on the result's <c>IndexContent</c>.</param>
/// <returns>The populated <see cref="PageInfo"/>; a value with no match is an empty string.</returns>
private PageInfo GetPageInfo(string htmlContent, IndexContent indexContent)
{
    Logger.Instance.Info("Enter GetPageInfo");

    PageInfo page = new PageInfo();
    page.IndexContent = indexContent;
    string html = htmlContent.ToLower();

    // Image URLs must start with the configured image host.
    string imagePattern = string.Format(@"<div class=""divimage""><img src=""(?<imageValue>{0}.*?)"" border=""0"" class=""imagecontent""></div>", HttpImageHost);

    // The page exposes its navigation data as JavaScript variables.
    page.preview_page = Regex.Match(html, @"var preview_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.next_page = Regex.Match(html, @"var next_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.index_page = Regex.Match(html, @"var index_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.bookid = Regex.Match(html, @"var bookid = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.readid = Regex.Match(html, @"var readid = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;

    MatchCollection images = Regex.Matches(html, imagePattern);
    for (int i = 0; i < images.Count; i++)
    {
        string imageUrl = images[i].Groups["imageValue"].Value;
        if (string.IsNullOrEmpty(imageUrl))
        {
            continue;
        }

        page.ImageList.Add(imageUrl);
    }

    Logger.Instance.Info("Exit GetPageInfo");
    return page;
}
//** Need to Change ****
/// <summary>
/// Extracts chapter links from table-based index pages (<c>&lt;td class="ccss"&gt;</c> cells).
/// The patterns are tried in order and the first one that yields matches is used.
/// </summary>
/// <param name="htmlContent">Index page HTML; it is lower-cased and stripped of
/// <c>Environment.NewLine</c> before matching.</param>
/// <returns>One <see cref="IndexContent"/> per matched link, in document order.</returns>
protected override List<IndexContent> GetIndex(string htmlContent)
{
    Logger.Instance.Info("Enter GetIndex");

    List<IndexContent> entries = new List<IndexContent>();
    string html = htmlContent.ToLower().Replace(System.Environment.NewLine, "");

    string[] patterns = new string[]
    {
        @"<td class=""ccss"">[\s]*<a href=""(?<url>.*?)"">(?<titleContent>.*?)</a>[\s]*</td>",
        @"<td class=""ccss""><div class=""dccss""><a href=""(?<url>.*?)"" alt=""(?<titleVaue>.*?)"">(?<titleContent>.*?)</a></div></td>"
    };

    // Stop at the first pattern that matches anything; otherwise keep the last (empty) result.
    MatchCollection links = null;
    for (int p = 0; p < patterns.Length; p++)
    {
        links = Regex.Matches(html, patterns[p]);
        if (links != null && links.Count > 0)
        {
            break;
        }
    }

    foreach (Match link in links)
    {
        IndexContent entry = new IndexContent();
        entry.Url = link.Groups["url"].Value;
        entry.Title = link.Groups["titleVaue"].Value;
        entry.TitleContent = link.Groups["titleContent"].Value;
        entry.LineContent = link.Value;

        if (!string.IsNullOrEmpty(entry.Url))
        {
            // The file name is whatever follows the last '/' in the URL.
            int lastSlash = entry.Url.LastIndexOf("/");
            string fileName = entry.Url.Substring(lastSlash + 1);
            if (!string.IsNullOrEmpty(fileName))
            {
                entry.FileName = fileName;
            }
        }

        entries.Add(entry);
    }

    Logger.Instance.Info("Exit GetIndex");
    return entries;
}
/// <summary>
/// Extracts chapter links from <c>&lt;dd&gt;</c> index entries, with or without a
/// <c>title</c> attribute. The patterns are tried in order and the first one that yields
/// matches is used.
/// </summary>
/// <param name="htmlContent">Index page HTML; it is lower-cased before matching.</param>
/// <returns>One <see cref="IndexContent"/> per matched link, in document order.</returns>
protected override List<IndexContent> GetIndex(string htmlContent)
{
    Logger.Instance.Info("Enter GetIndex");

    List<IndexContent> entries = new List<IndexContent>();
    string html = htmlContent.ToLower();

    string[] patterns =
    {
        @"<dd><a href=""(?<url>.*?)"" title=""(?<title>.*?)"">(?<titleContent>.*?)</a></dd>",
        @"<dd><a href=""(?<url>.*?)"">(?<titleContent>.*?)</a></dd>"
    };

    // Stop at the first pattern that matches anything; otherwise keep the last (empty) result.
    MatchCollection links = null;
    for (int p = 0; p < patterns.Length; p++)
    {
        links = Regex.Matches(html, patterns[p]);
        if (links != null && links.Count > 0)
        {
            break;
        }
    }

    foreach (Match link in links)
    {
        IndexContent entry = new IndexContent();
        entry.Url = link.Groups["url"].Value;

        // Fall back to the link text when there is no (or an empty) title attribute.
        if (string.IsNullOrEmpty(link.Groups["title"].Value))
        {
            entry.Title = link.Groups["titleContent"].Value;
        }
        else
        {
            entry.Title = link.Groups["title"].Value;
        }

        entry.TitleContent = link.Groups["titleContent"].Value;
        entry.LineContent = link.Value;

        if (!string.IsNullOrEmpty(entry.Url))
        {
            // The file name is whatever follows the last '/' in the URL.
            int lastSlash = entry.Url.LastIndexOf("/");
            string fileName = entry.Url.Substring(lastSlash + 1);
            if (!string.IsNullOrEmpty(fileName))
            {
                entry.FileName = fileName;
            }
        }

        entries.Add(entry);
    }

    Logger.Instance.Info("Exit GetIndex");
    return entries;
}
/// <summary>
/// Downloads one page (single attempt) and parses it into a <see cref="PageInfo"/> via
/// <c>GetPageInfo</c>.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <param name="indexContent">Index entry passed through to <c>GetPageInfo</c>.</param>
/// <param name="pi">Receives the parsed page on success; left untouched on failure.</param>
/// <returns><c>true</c> when the download succeeded; otherwise <c>false</c>.</returns>
private bool ProcessOnePage(string url, IndexContent indexContent, ref PageInfo pi)
{
    string html = string.Empty;
    Encoding pageEncoding = null;

    bool downloaded = FileDownloadUtil.DownloadWebPage(HttpMainHost, url, ref html, ref pageEncoding);
    if (downloaded)
    {
        pi = GetPageInfo(html, indexContent);
        return true;
    }

    Logger.Instance.Error(string.Format("Download Page Failed! [{0}]", url));
    return false;
}
/// <summary>
/// Extracts chapter links from plain <c>&lt;dd&gt;&lt;a href&gt;</c> index entries; the link
/// text is used for both <c>Title</c> and <c>TitleContent</c>.
/// </summary>
/// <param name="htmlContent">Index page HTML; it is lower-cased and stripped of
/// <c>Environment.NewLine</c> before matching.</param>
/// <returns>One <see cref="IndexContent"/> per matched link, in document order.</returns>
protected override List<IndexContent> GetIndex(string htmlContent)
{
    Logger.Instance.Info("Enter GetIndex");

    List<IndexContent> entries = new List<IndexContent>();
    string html = htmlContent.ToLower().Replace(System.Environment.NewLine, "");
    string linkPattern = @"<dd><a href=""(?<url>.*?)"">(?<titleContent>.*?)</a></dd>";

    MatchCollection links = Regex.Matches(html, linkPattern);
    for (int i = 0; i < links.Count; i++)
    {
        Match link = links[i];

        IndexContent entry = new IndexContent();
        entry.Url = link.Groups["url"].Value;
        entry.Title = link.Groups["titleContent"].Value;
        entry.TitleContent = link.Groups["titleContent"].Value;
        entry.LineContent = link.Value;

        if (!string.IsNullOrEmpty(entry.Url))
        {
            // The file name is whatever follows the last '/' in the URL.
            int lastSlash = entry.Url.LastIndexOf("/");
            string fileName = entry.Url.Substring(lastSlash + 1);
            if (!string.IsNullOrEmpty(fileName))
            {
                entry.FileName = fileName;
            }
        }

        entries.Add(entry);
    }

    Logger.Instance.Info("Exit GetIndex");
    return entries;
}
/// <summary>
/// Parses a page's HTML into a <see cref="PageInfo"/>. When image tags are found the page is
/// classified as <c>ContentTypeEnum.Image</c>; otherwise the text inside
/// <c>&lt;div class="width"&gt;</c> is extracted and the page is <c>ContentTypeEnum.Text</c>.
/// </summary>
/// <param name="htmlContent">Page HTML; it is lower-cased before matching.</param>
/// <param name="indexContent">Index entry stored on the result's <c>IndexContent</c>.</param>
/// <returns>The populated <see cref="PageInfo"/>; a value with no match is an empty string.</returns>
protected override PageInfo GetPageInfo(string htmlContent, IndexContent indexContent)
{
    Logger.Instance.Info("Enter GetPageInfo");

    PageInfo page = new PageInfo();
    page.IndexContent = indexContent;
    string html = htmlContent.ToLower();

    // Image URLs must start with the configured image host.
    string imagePattern = string.Format(@"<div class=""divimage""><img src=""(?<imageValue>{0}.*?)"" border=""0"" class=""imagecontent""></div>", HttpImageHost);
    string textPattern = "<div class=\"width\">(?<textValue>((?!</div>)[\\s\\S])*)<table align=";
    string scriptPattern = "<script>(?<scriptValue>[\\s\\S]*)</script>";

    // The page exposes its navigation data as JavaScript variables.
    page.preview_page = Regex.Match(html, @"var preview_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.next_page = Regex.Match(html, @"var next_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.index_page = Regex.Match(html, @"var index_page = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.bookid = Regex.Match(html, @"var bookid = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;
    page.readid = Regex.Match(html, @"var readid = ""(?<matchValue>.*?)"";").Groups["matchValue"].Value;

    MatchCollection images = Regex.Matches(html, imagePattern);
    if (images.Count > 0)
    {
        page.ContentType = ContentTypeEnum.Image;
        for (int i = 0; i < images.Count; i++)
        {
            string imageUrl = images[i].Groups["imageValue"].Value;
            if (string.IsNullOrEmpty(imageUrl))
            {
                continue;
            }

            page.ImageList.Add(imageUrl);
        }
    }
    else
    {
        string text = Regex.Match(html, textPattern).Groups["textValue"].Value;
        if (!string.IsNullOrEmpty(text))
        {
            // Drop the contents of any embedded script from the extracted text.
            string script = Regex.Match(text, scriptPattern).Groups["scriptValue"].Value;
            if (!string.IsNullOrEmpty(script))
            {
                text = text.Replace(script, "");
            }
        }

        page.ContentType = ContentTypeEnum.Text;
        page.TextContent = text;
    }

    Logger.Instance.Info("Exit GetPageInfo");
    return page;
}
/// <summary>
/// Builds the <see cref="PageInfo"/> for one downloaded page. Each site-specific subclass
/// supplies its own parsing.
/// </summary>
/// <param name="htmlContent">HTML text of the downloaded page.</param>
/// <param name="indexContent">Index entry the page belongs to. The overrides visible in this
/// file store it on the result's <c>IndexContent</c>.</param>
/// <returns>The parsed page. NOTE(review): callers null-check the result and call
/// <c>ValidPageInfor()</c> on it, so implementations presumably may return an incomplete
/// object; confirm the contract.</returns>
protected abstract PageInfo GetPageInfo(string htmlContent, IndexContent indexContent);
/// <summary>
/// Worker loop: repeatedly dequeues an <see cref="IndexContent"/> from <paramref name="indexQueue"/>,
/// downloads and builds the page, and reports progress until the queue is empty, <c>StopJob</c>
/// is set, or <paramref name="count"/> exceeds <c>mMaxPages</c>.
/// </summary>
/// <remarks>
/// Flow, as written:
/// 1. Flags the current managed thread id as active in <c>threadPools</c>.
/// 2. Dequeues under <c>lock (indexQueue)</c>; an empty queue ends the loop.
/// 3. In update mode, an entry whose file already exists under <c>mSaveToRoot</c> is skipped and
///    remembered in <c>mLastExistIndexContent</c>.
/// 4. If an entry was remembered (update mode), that page is processed again first, then the
///    remembered entry is cleared. Failures there are logged but not counted.
/// 5. The current entry is processed; any download, validation, or <c>BuildPage</c> failure
///    increments <paramref name="failed"/>.
/// 6. Progress is fired with a remaining-time estimate computed from <paramref name="totalTime"/>
///    as it was BEFORE this iteration refreshes it from <paramref name="oneJobStart"/>.
/// 7. After the loop the thread is flagged inactive; under the queue lock the queue is cleared and,
///    if no entry in <c>threadPools</c> is still flagged active, <c>FinishOK</c> is fired.
/// NOTE(review): <c>count</c>, <c>failed</c>, <c>totalTime</c>, <c>threadPools</c> and
/// <c>mLastExistIndexContent</c> are updated outside the lock. If this method runs on several
/// threads at once (which <c>threadPools</c> suggests), those updates are unsynchronized; confirm.
/// NOTE(review): the "Invalid File Content" log in step 4 names the current entry's file, not the
/// remembered one; confirm whether that is intended.
/// </remarks>
/// <param name="indexQueue">Queue of pending index entries; also used as the lock object.</param>
/// <param name="pageInfoList">Receives every page whose <c>ValidPageInfor()</c> check passed.</param>
/// <param name="count">Running count of dequeued entries; incremented once per dequeue.</param>
/// <param name="failed">Running count of entries that failed to process.</param>
/// <param name="encoding">Encoding passed to <c>BuildPage</c>.</param>
/// <param name="tempContent">Page template text passed to <c>BuildPage</c>.</param>
/// <param name="indexListCount">Total number of index entries, used for progress reporting.</param>
/// <param name="oneJobStart">Start time used to compute elapsed seconds.</param>
/// <param name="totalTime">Elapsed seconds since <paramref name="oneJobStart"/>, refreshed at the end of each iteration.</param>
protected virtual void Run(ref Queue <IndexContent> indexQueue, ref List <PageInfo> pageInfoList, ref int count, ref int failed, Encoding encoding, string tempContent, int indexListCount, DateTime oneJobStart, ref double totalTime) { threadPools[System.Threading.Thread.CurrentThread.ManagedThreadId] = true; while (!StopJob) { if (count > mMaxPages) { Logger.Instance.InfoImportant("Reach Max Pages: " + mMaxPages); break; } bool onePageRes = true; IndexContent indexContent = null; lock (indexQueue) { if (indexQueue.Count > 0) { indexContent = indexQueue.Dequeue(); } else { break; } } count++; Logger.Instance.Info("Processing Record: " + count); string file = string.Format("{0}/{1}", mSaveToRoot, indexContent.FileName); if (File.Exists(file) && mUpdateMode) { Logger.Instance.InfoImportant("Update Mode File Exists: " + new FileInfo(file).Name); mLastExistIndexContent = indexContent; continue; } PageInfo piLastExists = null; if (mLastExistIndexContent != null && mUpdateMode) { if (ProcessOnePage(mLastExistIndexContent.Url, mLastExistIndexContent, ref piLastExists)) { if (piLastExists != null && piLastExists.ValidPageInfor()) { pageInfoList.Add(piLastExists); DownloadImages(piLastExists, !mUpdateMode); if (piLastExists.ValidPageContent()) { BuildPage(piLastExists, tempContent, encoding); } } else { Logger.Instance.InfoImportant("Invalid File Content: (could be a coding issue)" + new FileInfo(file).Name); } } mLastExistIndexContent = null; } PageInfo pi = null; if (ProcessOnePage(indexContent.Url, indexContent, ref pi)) { if (pi != null && pi.ValidPageInfor()) { pageInfoList.Add(pi); DownloadImages(pi, !mUpdateMode); if (pi.ValidPageContent()) { if (!BuildPage(pi, tempContent, encoding)) { onePageRes = false; } } } else { Logger.Instance.InfoImportant("Invalid File Content: (could be a coding issue)" + new FileInfo(file).Name); onePageRes = false; } } else { onePageRes = false; } if (!onePageRes) { failed++; } int timeRemaining = ((count == 0 || count == 1) ? 
0 : (int)((indexListCount - count - 1) * totalTime / count - 1)); FireOnProgressChangedEvent(indexListCount, count, timeRemaining, failed, ProgressEnum.Progressing); DateTime oneJobEnd = DateTime.Now; totalTime = oneJobEnd.Subtract(oneJobStart).TotalSeconds; } threadPools[System.Threading.Thread.CurrentThread.ManagedThreadId] = false; lock (indexQueue) { indexQueue.Clear(); bool result = true; foreach (int key in threadPools.Keys) { if (threadPools[key]) { result = false; } } if (result) { FireOnProgressChangedEvent(indexListCount, count, ProgressEnum.FinishOK); } } }
/// <summary>
/// Sequentially downloads and builds every page listed in the index: loads the index via
/// <c>ProcessIndex</c>, writes the index page, then walks the entries one by one, reporting
/// progress after each processed entry.
/// </summary>
/// <remarks>
/// In update mode an entry whose file already exists under <c>mSaveToRoot</c> is skipped and
/// remembered in <c>mLastExistIndexContent</c>; when the next missing entry is reached, the
/// remembered page is processed again first (failures there are logged but not counted).
/// The remaining-time estimate uses the elapsed time measured at the end of the previous
/// iteration.
/// </remarks>
/// <param name="pageInfoList">Replaced with a new list that receives every page whose
/// <c>ValidPageInfor()</c> check passed.</param>
/// <param name="indexList">Index entries, filled in by <c>ProcessIndex</c>.</param>
/// <param name="encoding">Page encoding from <c>ProcessIndex</c>; set to UTF-8 when still
/// <c>null</c> after the index has been built.</param>
/// <returns><c>true</c> when the index was obtained and the loop ran (individual page failures
/// do not change this); <c>false</c> when <c>ProcessIndex</c> failed or produced no list.</returns>
protected virtual bool ProcessToGetPageInfosOneByOne(ref List<PageInfo> pageInfoList, ref List<IndexContent> indexList, ref Encoding encoding)
{
    int count = 0;
    int failed = 0;
    pageInfoList = new List<PageInfo>();

    if (!ProcessIndex(ref indexList, ref encoding) || indexList == null)
    {
        Logger.Instance.Info("Get Page Infor List Failed!");
        return false;
    }

    Logger.Instance.Info("Build Index!");
    BuildIndex(indexList, encoding, mNovelTitle);
    Logger.Instance.Info("Get Page Infor List OK!");
    FireOnProgressChangedEvent(indexList.Count, 0, ProgressEnum.Progressing);

    if (!Directory.Exists(mSaveToRoot))
    {
        Directory.CreateDirectory(mSaveToRoot);
    }

    DirectoryInfo di = new DirectoryInfo(mSaveToRoot);
    Logger.Instance.Info("Save To: " + di.FullName);

    // Read the page template; 'using' guarantees both streams are released even if reading throws.
    string tempContent;
    using (FileStream templateStream = new FileStream(htmlPageTemplate, FileMode.Open, FileAccess.Read))
    using (StreamReader templateReader = new StreamReader(templateStream))
    {
        tempContent = templateReader.ReadToEnd();
    }

    if (encoding == null)
    {
        encoding = Encoding.UTF8;
    }

    double totalTime = 0;
    DateTime oneJobStart = DateTime.Now;

    foreach (IndexContent indexContent in indexList)
    {
        bool onePageRes = true;

        if (StopJob)
        {
            break;
        }

        count++;

        // Skip entries before the requested start index (count is 1-based here).
        if (Argument.StartFromIndex > 0 && count < Argument.StartFromIndex)
        {
            continue;
        }

        if (count > mMaxPages)
        {
            Logger.Instance.InfoImportant("Reach Max Pages: " + mMaxPages);
            break;
        }

        Logger.Instance.Info("Processing Record: " + count);
        string file = string.Format("{0}/{1}", mSaveToRoot, indexContent.FileName);

        if (File.Exists(file) && mUpdateMode)
        {
            Logger.Instance.InfoImportant("Update Mode File Exists: " + new FileInfo(file).Name);
            mLastExistIndexContent = indexContent;
            continue;
        }

        // Update mode: process the last already-existing page again before the first missing one.
        PageInfo piLastExists = null;
        if (mLastExistIndexContent != null && mUpdateMode)
        {
            if (ProcessOnePage(mLastExistIndexContent.Url, mLastExistIndexContent, ref piLastExists))
            {
                if (piLastExists != null && piLastExists.ValidPageInfor())
                {
                    pageInfoList.Add(piLastExists);
                    DownloadImages(piLastExists, !mUpdateMode);
                    if (piLastExists.ValidPageContent())
                    {
                        BuildPage(piLastExists, tempContent, encoding);
                    }
                }
                else
                {
                    Logger.Instance.InfoImportant("Invalid File Content: (could be a coding issue)" + new FileInfo(file).Name);
                }
            }

            mLastExistIndexContent = null;
        }

        PageInfo pi = null;
        if (ProcessOnePage(indexContent.Url, indexContent, ref pi))
        {
            if (pi != null && pi.ValidPageInfor())
            {
                pageInfoList.Add(pi);
                DownloadImages(pi, !mUpdateMode);
                if (pi.ValidPageContent())
                {
                    if (!BuildPage(pi, tempContent, encoding))
                    {
                        onePageRes = false;
                    }
                }
            }
            else
            {
                Logger.Instance.InfoImportant("Invalid File Content: (could be a coding issue)" + new FileInfo(file).Name);
                onePageRes = false;
            }
        }
        else
        {
            onePageRes = false;
        }

        if (!onePageRes)
        {
            failed++;
        }

        int timeRemaining = ((count == 0 || count == 1) ? 0 : (int)((indexList.Count - count - 1) * totalTime / count - 1));
        FireOnProgressChangedEvent(indexList.Count, count, timeRemaining, failed, ProgressEnum.Progressing);

        DateTime oneJobEnd = DateTime.Now;
        totalTime = oneJobEnd.Subtract(oneJobStart).TotalSeconds;
    }

    return true;
}
/// <summary>
/// Parses a page's HTML into a <see cref="PageInfo"/>. When image tags are found the page is
/// classified as <c>ContentTypeEnum.Image</c>; otherwise the text between the
/// <c>txtright</c> block and the closing <c>&lt;/pre&gt;</c> is extracted, anchor tags are
/// stripped, and the page is <c>ContentTypeEnum.Text</c>.
/// </summary>
/// <remarks>
/// <c>index_page</c> is always "index.html"; the page's own <c>index_page</c> variable is not
/// consulted. Previous/next links are reduced with <c>GetFileNameFromUrl</c>.
/// </remarks>
/// <param name="htmlContent">Page HTML; it is lower-cased before matching.</param>
/// <param name="indexContent">Index entry stored on the result's <c>IndexContent</c>.</param>
/// <returns>The populated <see cref="PageInfo"/>; a value with no match is an empty string.</returns>
protected override PageInfo GetPageInfo(string htmlContent, IndexContent indexContent)
{
    Logger.Instance.Info("Enter GetPageInfo");

    PageInfo ps = new PageInfo();
    ps.IndexContent = indexContent;
    htmlContent = htmlContent.ToLower();

    string pattern_preview_page = @"var preview_page = ""(?<matchValue>.*?)"";";
    string pattern_next_page = @"var next_page = ""(?<matchValue>.*?)"";";
    string pattern_bookid = @"var article_id = ""(?<matchValue>.*?)"";";
    string pattern_readid = @"var chapter_id = ""(?<matchValue>.*?)"";";
    // Image URLs must start with the configured image host.
    string pattern_images = string.Format(@"<div class=""divimage""><img src=""(?<imageValue>{0}.*?)"" border=""0"" class=""imagecontent""></div>", HttpImageHost);
    string pattern_text = "<div id=\"txtright\"><script type=\"text/javascript\">txtrightshow\\(\\);</script></div>(?<textValue>((?!</div>)(?!</pre>)[\\s\\S])*)</pre>";
    // Applied in order: bracketed anchors first, then any remaining anchors.
    string[] pattern_Scripts = new string[]
    {
        "\\[<a id=(?<scriptValue>((?!</a>)[\\s\\S])*)</a>\\]",
        "<a id=(?<scriptValue>((?!</a>)[\\s\\S])*)</a>"
    };

    Match match_preview_page = Regex.Match(htmlContent, pattern_preview_page);
    ps.preview_page = GetFileNameFromUrl(match_preview_page.Groups["matchValue"].Value);

    Match match_next_page = Regex.Match(htmlContent, pattern_next_page);
    ps.next_page = GetFileNameFromUrl(match_next_page.Groups["matchValue"].Value);

    // The index link is fixed, so no regex is evaluated for it.
    ps.index_page = "index.html";

    Match match_bookid = Regex.Match(htmlContent, pattern_bookid);
    ps.bookid = match_bookid.Groups["matchValue"].Value;

    Match match_readid = Regex.Match(htmlContent, pattern_readid);
    ps.readid = match_readid.Groups["matchValue"].Value;

    MatchCollection collections = Regex.Matches(htmlContent, pattern_images);
    if (collections.Count > 0)
    {
        ps.ContentType = ContentTypeEnum.Image;
        foreach (Match match in collections)
        {
            string imageValue = match.Groups["imageValue"].Value;
            if (!string.IsNullOrEmpty(imageValue))
            {
                ps.ImageList.Add(imageValue);
            }
        }
    }
    else
    {
        // Turn line breaks into <br/> before extraction so they survive in the text.
        htmlContent = htmlContent.Replace(System.Environment.NewLine, "<br/>");
        Match match = Regex.Match(htmlContent, pattern_text);
        string textValue = match.Groups["textValue"].Value;
        if (!string.IsNullOrEmpty(textValue))
        {
            foreach (string pattern_Script in pattern_Scripts)
            {
                textValue = Regex.Replace(textValue, pattern_Script, "");
            }

            textValue = RemoveInvalidString(textValue);
        }

        ps.ContentType = ContentTypeEnum.Text;
        ps.TextContent = textValue;
    }

    Logger.Instance.Info("Exit GetPageInfo");
    return ps;
}
/// <summary>
/// Comparison used to sort index entries by <c>FileName</c>, using
/// <see cref="string.Compare(string, string)"/>.
/// </summary>
/// <param name="ic1">Left entry.</param>
/// <param name="ic2">Right entry.</param>
/// <returns>Negative, zero, or positive, as returned by the string comparison of the two file names.</returns>
private static int CompareIndexContent(IndexContent ic1, IndexContent ic2)
{
    string leftName = ic1.FileName;
    string rightName = ic2.FileName;
    return string.Compare(leftName, rightName);
}
/// <summary>
/// Extracts chapter links from <c>&lt;dd&gt;</c> index entries that hold either two links or
/// one link. Both patterns are applied and their matches combined; the result is sorted by
/// file name with <c>CompareIndexContent</c>.
/// </summary>
/// <param name="htmlContent">Index page HTML; it is lower-cased and stripped of
/// <c>Environment.NewLine</c> before matching.</param>
/// <returns>The sorted list of <see cref="IndexContent"/> entries.</returns>
protected override List<IndexContent> GetIndex(string htmlContent)
{
    Logger.Instance.Info("Enter GetIndex");

    List<IndexContent> entries = new List<IndexContent>();
    string html = htmlContent.ToLower().Replace(System.Environment.NewLine, "");

    string[] patterns = new string[]
    {
        @"<dd>([\s])*<a href=""(?<url>((?!<a href)[\s\S])*)"">(?<titleContent>((?!<a href)[\s\S])*)</a>[\s]*<a href=""(?<url_2>((?!<a href)[\s\S])*)"">(?<titleContent_2>((?!<a href)[\s\S])*)</a>[\s]*</dd>",
        @"<dd>([\s])*<a href=""(?<url>((?!<a href)[\s\S])*)"">(?<titleContent>((?!<a href)[\s\S])*)</a>((?!<a href)[\s\S])*</dd>"
    };

    for (int p = 0; p < patterns.Length; p++)
    {
        MatchCollection found = Regex.Matches(html, patterns[p]);
        for (int i = 0; i < found.Count; i++)
        {
            Match row = found[i];

            // First link of the row: always added, even when its URL is empty.
            IndexContent first = new IndexContent();
            first.Url = row.Groups["url"].Value;
            first.Title = row.Groups["titleContent"].Value;
            first.TitleContent = row.Groups["titleContent"].Value;
            first.LineContent = row.Value;
            if (!string.IsNullOrEmpty(first.Url))
            {
                int lastSlash = first.Url.LastIndexOf("/");
                string fileName = first.Url.Substring(lastSlash + 1);
                if (!string.IsNullOrEmpty(fileName))
                {
                    first.FileName = fileName;
                }
            }
            entries.Add(first);

            // Second link of the row: only added when the pattern captured a URL for it.
            IndexContent second = new IndexContent();
            second.Url = row.Groups["url_2"].Value;
            second.Title = row.Groups["titleContent_2"].Value;
            second.TitleContent = row.Groups["titleContent_2"].Value;
            second.LineContent = row.Value;
            if (string.IsNullOrEmpty(second.Url))
            {
                continue;
            }

            int secondSlash = second.Url.LastIndexOf("/");
            string secondName = second.Url.Substring(secondSlash + 1);
            if (!string.IsNullOrEmpty(secondName))
            {
                second.FileName = secondName;
            }
            entries.Add(second);
        }
    }

    Logger.Instance.Info("Exit GetIndex");
    entries.Sort(CompareIndexContent);
    return entries;
}