/// <summary>
/// Entry point of the crawler: remembers the target queue, downloads the index page
/// and hands every link found on it to <c>GetPostList</c>.
/// </summary>
/// <param name="queue">
/// The queue that receives the crawled posts. It is cast to <c>Queue&lt;Post&gt;</c>,
/// so any other non-null runtime type makes the cast throw.
/// </param>
public void Process(object queue)
{
    csdnPostQueue = (Queue<Post>)queue;

    // baseUrl is a format template; it is filled in with the value 1
    // (presumably the index page number - confirm against the field's definition).
    string indexUrl = string.Format(baseUrl, 1);
    string indexHtml = BlogFunUtlity.GetURLContents(indexUrl);

    MatchCollection indexEntries = regIndex.Matches(indexHtml);
    foreach (Match entry in indexEntries)
    {
        // Capture group 1 of regIndex is what gets passed on as the URL to process.
        string entryUrl = entry.Groups[1].Value;

        Console.WriteLine("Enqueue one item.");
        GetPostList(entryUrl);
    }
}
/// <summary>
/// Derives a blog's base address from one of its article URLs, walks the blog's
/// paginated "list/N" pages and passes every article link found there to <c>CheckItem</c>.
/// </summary>
/// <param name="url">
/// An article URL containing the text "details"; everything in front of that text is
/// used as the blog's base address. URLs without it are logged and skipped.
/// </param>
private void GetPostList(string url)
{
    Console.WriteLine("Processing {0}", url);

    // Without the "details" marker the base address cannot be derived. Previously the -1
    // from IndexOf went straight into Substring and threw ArgumentOutOfRangeException,
    // which aborted the whole crawl because the caller does not catch it.
    int detailIndex = string.IsNullOrEmpty(url) ? -1 : url.IndexOf("details", StringComparison.Ordinal);
    if (detailIndex < 0)
    {
        Console.WriteLine("Skipping {0}: no \"details\" segment found.", url);
        return;
    }

    string homePageUrl = url.Substring(0, detailIndex);

    // Plain concatenation instead of string.Format: the blog URL is data, not a format
    // template, so a stray '{' or '}' in it must not raise FormatException.
    string listUrlPrefix = homePageUrl + "list/";

    Regex regPage = new Regex("<a href=.*?list/(\\d+)\">.*?</a>");
    string firstPageContent = BlogFunUtlity.GetURLContents(listUrlPrefix + 1);
    MatchCollection pageLinks = regPage.Matches(firstPageContent);

    // The page number of the LAST pagination link is taken as the page count.
    // NOTE(review): when no pagination link is found the count stays 0 and nothing is
    // processed, not even page 1 - confirm that single-page blogs are meant to be skipped.
    int maxPage = 0;
    if (pageLinks.Count > 0)
    {
        string lastPageText = pageLinks[pageLinks.Count - 1].Groups[1].Value;

        // TryParse instead of Parse: \d also matches non-ASCII digits and the run of digits
        // may overflow Int32. In both cases the count falls back to 0 instead of throwing.
        if (!int.TryParse(lastPageText, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out maxPage))
        {
            maxPage = 0;
        }
    }

    Regex regItemInPage = new Regex("<div class=\"article_title\">[\\d\\D]*?<span class=\"link_title\">[\\d\\D]*?href=\"(.*?)\">([\\d\\D]*?)</a>");
    for (int page = 1; page <= maxPage; page++)
    {
        // Page 1 was already downloaded above to read the pagination; reuse it.
        string pageContent = page == 1
            ? firstPageContent
            : BlogFunUtlity.GetURLContents(listUrlPrefix + page);

        foreach (Match item in regItemInPage.Matches(pageContent))
        {
            string title = item.Groups[2].Value.Trim();

            // Titles containing "font" (presumably <font> markup) keep only the text
            // after the last '>'.
            if (title.Contains("font"))
            {
                string[] titleParts = title.Split('>');
                title = titleParts[titleParts.Length - 1];
            }

            // Group 1 is the href value; it is appended to the site root to build the article URL.
            CheckItem(new BlogIndexItem(title, "http://blog.csdn.net" + item.Groups[1].Value.Trim()));
        }
    }
}
/// <summary>
/// Downloads one article, extracts its body with <c>regContent</c>, collects the image and
/// code spans inside it, lets <c>ProcessImageCode</c> work on those spans, splices each
/// span's <c>Content</c> back into the body and enqueues the result as a <c>Post</c>.
/// </summary>
/// <param name="item">Supplies the article URL to download and the title stored on the post.</param>
private void processContent(BlogIndexItem item)
{
    string content = BlogFunUtlity.GetURLContents(item.URL);
    Match artical = regContent.Match(content);

    // Groups[1].Value is "" when regContent does not match, so a post with empty
    // content is still enqueued in that case.
    string result = artical.Groups[1].Value;

    List<ContentSem> markList = new List<ContentSem>();

    // Image spans: only capture group 1 values starting with "http:" are recorded.
    // Ordinal comparison, because this is a protocol prefix and not linguistic text.
    foreach (Match imgItem in regImage.Matches(result))
    {
        Group imgSource = imgItem.Groups[1];
        if (imgSource.Value.StartsWith("http:", StringComparison.Ordinal))
        {
            markList.Add(new ContentSem("img", imgSource.Index, imgSource.Length, imgSource.Value));
        }
    }

    // Code spans: a span whose content itself matches regImage is left out, so the image
    // spans recorded above never sit inside a recorded code span.
    foreach (Match codeItem in regCode.Matches(result))
    {
        Group codeSpan = codeItem.Groups[0];
        ContentSem cs = new ContentSem("code", codeSpan.Index, codeSpan.Length, codeSpan.Value);
        if (!regImage.IsMatch(cs.Content))
        {
            markList.Add(cs);
        }
    }

    StringBuilder buffer = new StringBuilder();
    if (markList.Count > 0)
    {
        // Sort once and materialize: the sequence is consumed twice (by ProcessImageCode and
        // by the loop below), and a deferred OrderBy query would re-sort on each pass.
        IEnumerable<ContentSem> orderList = markList.OrderBy(c => c.Index).ToList();
        ProcessImageCode(orderList);

        int index = 0;
        foreach (var listItem in orderList)
        {
            // A span starting inside the previously emitted span would give Substring a
            // negative length; skip it rather than throw.
            if (listItem.Index < index)
            {
                continue;
            }

            // Copy the untouched text in front of the span, then the span's current Content
            // in place of the original text.
            buffer.Append(result.Substring(index, listItem.Index - index));
            buffer.Append(listItem.Content);
            index = listItem.Index + listItem.Length;
        }

        // Remainder after the last span.
        buffer.Append(result.Substring(index));
    }
    else
    {
        buffer.Append(result);
    }

    Post CsdnPost = new Post();
    CsdnPost.Title = item.Title;
    CsdnPost.Content = buffer.ToString();

    // The queue object itself is the lock (presumably its consumers lock on it too -
    // confirm before changing the lock target).
    lock (csdnPostQueue)
    {
        csdnPostQueue.Enqueue(CsdnPost);
    }

    Console.WriteLine("Processing {0} Done.", item.URL);
}