public void Process(Crawler crawler, PropertyBag propertyBag)
{
    foreach (Uri uri in seeds)
    {
        crawler.AddStep(uri, 2);
    }
}
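// Usage sketch (not part of the original source): the Process methods in this file are NCrawler
// pipeline steps. Assuming they live in classes implementing NCrawler's IPipelineStep, wiring a
// seed-injection step like the one above into a crawl looks roughly like this. The seed URL, the
// class names SeedingStep/CrawlerHost and the limits are illustrative assumptions; namespaces
// follow NCrawler 3.x as best I recall and may need adjusting.
using System;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;

public class SeedingStep : IPipelineStep
{
    private readonly Uri[] seeds =
    {
        new Uri("http://example.com/sitemap.xml")   // placeholder seed
    };

    // Same shape as the step above: push every seed into the crawl queue at depth 2.
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        foreach (Uri uri in seeds)
        {
            crawler.AddStep(uri, 2);
        }
    }
}

public static class CrawlerHost
{
    public static void Main()
    {
        // Pipeline steps run in order for every downloaded document.
        using (Crawler crawler = new Crawler(new Uri("http://example.com/"),
            new HtmlDocumentProcessor(),   // parses HTML and queues discovered links
            new SeedingStep()))
        {
            crawler.MaximumThreadCount = 2;
            crawler.MaximumCrawlDepth = 3;
            crawler.Crawl();
        }
    }
}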
protected virtual void AddStepToCrawler(Crawler crawler, PropertyBag propertyBag, string normalizedLink, string link)
{
    crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
        propertyBag.Step,
        new Dictionary<string, object>
        {
            { Resources.PropertyBagKeyOriginalUrl, link },
            { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
        });
}
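// Illustration only: AddStepToCrawler is protected virtual, so a derived processor can filter or
// rewrite links before they are queued. DocumentProcessorBase and SameHostProcessor are
// hypothetical names standing in for whatever class declares the method above; Crawler and
// PropertyBag come from NCrawler.
using System;
using NCrawler;

public class SameHostProcessor : DocumentProcessorBase   // hypothetical base class
{
    protected override void AddStepToCrawler(Crawler crawler, PropertyBag propertyBag, string normalizedLink, string link)
    {
        // Only follow links that stay on the host of the page that referenced them.
        Uri target;
        if (Uri.TryCreate(normalizedLink, UriKind.Absolute, out target) &&
            string.Equals(target.Host, propertyBag.ResponseUri.Host, StringComparison.OrdinalIgnoreCase))
        {
            base.AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
        }
    }
}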
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsXmlContent(propertyBag.ContentType)) { return; }

    using (Stream reader = propertyBag.GetResponse())
    using (StreamReader sr = new StreamReader(reader))
    {
        XDocument mydoc = XDocument.Load(sr);
        if (mydoc.Root == null) { return; }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            select e.Value;

        foreach (string url in urlNodes)
        {
            // add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty()) { continue; }

            crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                propertyBag.Step,
                new Dictionary<string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, url },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                });
        }
    }
}
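// Reference sketch (not part of the original step): a minimal sitemap in the sitemaps.org 0.9
// namespace that the query above matches, with the same Descendants("loc") selection run
// standalone. The URLs are placeholders.
using System;
using System.Linq;
using System.Xml.Linq;

public static class SitemapLocDemo
{
    public static void Main()
    {
        const string xml = @"<urlset xmlns=""http://www.sitemaps.org/schemas/sitemap/0.9"">
  <url><loc>http://example.com/</loc></url>
  <url><loc>http://example.com/page-1</loc></url>
</urlset>";

        XDocument doc = XDocument.Parse(xml);
        XName loc = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

        // Mirrors the LINQ query in the step above: keep only absolute http:// locations.
        var urls = doc.Descendants(loc)
                      .Select(e => e.Value)
                      .Where(v => v.StartsWith("http://", StringComparison.OrdinalIgnoreCase));

        foreach (string url in urls)
        {
            Console.WriteLine(url);   // prints the two example locations
        }
    }
}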
public virtual void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    string text = propertyBag.Text;
    if (HasTextStripRules) { text = StripText(text); }
    if (text.IsNullOrEmpty()) { return; }
    if (HasLinkStripRules) { text = StripLinks(text); }

    // Find links
    MatchCollection matches = s_LinkRegex.Value.Matches(text);
    foreach (Match match in matches.Cast<Match>().Where(m => m.Success))
    {
        string link = match.Value;
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalizedLink = link.NormalizeUrl(baseUrl);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        // Add new step to crawler
        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public virtual void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    string text = propertyBag.Text;
    if (HasTextStripRules) { text = StripText(text); }
    if (text.IsNullOrEmpty()) { return; }
    if (HasLinkStripRules) { text = StripLinks(text); }

    // Find links
    MatchCollection matches = s_LinkRegex.Value.Matches(text);
    foreach (Match match in matches.Cast<Match>().Where(m => m.Success))
    {
        string link = match.Value;
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalizedLink = link.NormalizeUri(baseUrl);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        // Add new step to crawler
        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    int maxPage = CrawlUtil.getMaxReviewIFramePageNumber(htmlDoc);

    //add other review pages if at base (uri) page
    if (propertyBag.ResponseUri.ToString() == CrawlReviewIFrame.baseUri && maxPage != -1)
    {
        int maxPageToCrawl = maxPage;
        string uri = "";
        //if (maxPage > 10) commenting this out means to crawl all review pages.
        //{
        //    maxPageToCrawl = 10;
        //}
        for (int i = 2; i <= maxPageToCrawl; i++)
        {
            uri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
                + CrawlReviewIFrame.currentBook.isbn + "&links=660&min_rating=&page=" + i
                + "&review_back=fff&stars=000&text=000";
            crawler.AddStep(new Uri(uri), 0);
        }
        CrawlReviewIFrame.form.appendLineToLog("Crawling " + maxPageToCrawl + " pages of reviews for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }

    //only process review iframe pages
    if (!propertyBag.ResponseUri.OriginalString.StartsWith(CrawlReviewIFrame.baseUri.Substring(0, 100)))
    {
        return;
    }

    lock (this)
    {
        string currentPage = "0";
        var currentPageNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='current']");
        if (currentPageNode != null)
        {
            currentPage = currentPageNode.InnerText.Trim();
        }

        var reviews = htmlDoc.DocumentNode.SelectNodes("//*[@itemtype='http://schema.org/Review']");
        if (reviews == null || reviews.Count == 0)
        {
            return;
        }

        //**do stuff to handle dupes properly
        // -_-
        //current method just saves each review one by one and ignores all errors when trying to save.
        //this also means all reviews are attempted to be saved again no matter what :(
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var r in reviews)
        {
            string reviewUrl;
            int reviewId = -1;
            Match match;
            var reviewLinkNode = r.SelectSingleNode(".//div[@class='gr_review_text']/link[@itemprop='url']");
            DateTime publishDate = DateTime.MinValue;
            short starRating = 0;
            Review toAdd = new Review();

            if (reviewLinkNode != null)
            {
                reviewUrl = reviewLinkNode.GetAttributeValue("href", "null");
                match = regReview.Match(reviewUrl);
                if (Int32.TryParse(match.Groups[1].Value, out reviewId))
                {
                    if (CrawlReviewIFrame.existingReviewIds.Contains(reviewId))
                    {
                        continue;
                    }

                    var node = r.SelectSingleNode(".//span[@class='gr_review_date']");
                    if (node != null)
                    {
                        DateTime.TryParse(node.InnerText, out publishDate);
                    }

                    node = r.SelectSingleNode(".//span[@class='gr_rating']");
                    if (node != null)
                    {
                        starRating = CrawlUtil.countStarsFromString(node.InnerText);
                    }

                    toAdd.id = reviewId;
                    toAdd.bookId = CrawlReviewIFrame.currentBook.id;
                    toAdd.publishDate = publishDate;
                    toAdd.starRating = starRating;
                    toAdd.foundOnPage = Int32.Parse(currentPage);
                    toAdd.maxPage = maxPage;
                    context.Reviews.AddObject(toAdd);
                }

                try
                {
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    context.Reviews.Detach(toAdd);
                    CrawlReviewIFrame.form.appendLineToLog(ex.Message);
                    if (ex.InnerException != null)
                    {
                        CrawlReviewIFrame.form.appendLineToLog("\t" + ex.InnerException.Message);
                    }
                }
            }
        }

        CrawlReviewIFrame.form.appendLineToLog("Added " + reviews.Count + " on page " + currentPage + " of " + maxPage + " for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    //add pages only if at base uri page
    if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage)
    {
        string uri = "";
        for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++)
        {
            uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i;
            crawler.AddStep(new Uri(uri), 0);
            CrawlList.form.appendLineToLog("also crawling " + uri);
        }
    }

    //only process list pages
    if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever"))
    {
        return;
    }

    var s = propertyBag["HtmlDoc"].Value;
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc != null)
    {
        lock (this)
        {
            var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book']");
            if (books == null || books.Count == 0)
            {
                return;
            }

            GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
            foreach (var b in books)
            {
                string title = "null";
                string authorName = "null";
                var titleURLNode = b.SelectSingleNode(".//*[@class='bookTitle']");
                var authorURLNode = b.SelectSingleNode(".//*[@class='authorName']");
                string titleUrl = "null";
                string authorUrl = "null";
                Match match;
                string bookId = "-1";
                string authorId = "-1";
                Book newBook = null;
                Author author = null;

                if (titleURLNode != null && authorURLNode != null)
                {
                    titleUrl = titleURLNode.GetAttributeValue("href", "null");
                    match = regBook.Match(titleUrl);
                    bookId = match.Groups[1].Value;
                    title = titleURLNode.InnerText.Trim();

                    authorUrl = authorURLNode.GetAttributeValue("href", "null");
                    match = regAuthor.Match(authorUrl);
                    authorId = match.Groups[1].Value;
                    authorName = authorURLNode.InnerText.Trim();

                    author = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName);
                    newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title);
                    newBook.Author = author;
                    //author.Book = newBook;
                }
                context.SaveChanges();
            }
            CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors");
        }
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
    {
        /*
         * <div>
         *   <span class="previous_page disabled">« previous</span>
         *   <em class="current">1</em>
         *   <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
         *   <a href="/list/user_votes/1045275-natasha?page=3">3</a>
         *   <a href="/list/user_votes/1045275-natasha?page=4">4</a>
         *   <a href="/list/user_votes/1045275-natasha?page=5">5</a>
         *   <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next »</a>
         * </div>
         */
        var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");
        if (node != null)
        {
            try
            {
                var x = node.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlListAndVotes.form.appendLineToLog(uri);
                }
            }
            catch (Exception ex)
            {
                CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var listNode in doc.SelectNodes(".//div[@class='cell']"))
        {
            List l = null;
            string title = null;
            var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
            if (titleNode != null)
            {
                title = titleNode.InnerText.Trim();
            }

            if (title != null)
            {
                l = CrawlUtil.createOrGetList(context, title);
            }
            else
            {
                continue;
            }

            /*
             * 296 books
             * —
             * 994 voters
             */
            var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
            if (statsNode != null)
            {
                string s = statsNode.InnerText.Replace("\n", "").Trim();
                l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                s = s.Substring(s.IndexOf("books"));
                l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
            }

            User u = CrawlUtil.getUser(context, User.id);
            u.Lists.Add(l);
            try
            {
                context.SaveChanges();
                CrawlListAndVotes.count++;
            }
            catch (Exception ex)
            {
                User.Lists.Remove(l);
                //this just prints out to check an inner exception which is a dupe PK error
                //CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
        CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, "crawler")
        .NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsHtmlContent(propertyBag.ContentType)) { return; }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (Stream reader = propertyBag.GetResponse())
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    propertyBag.Text = htmlDoc.ExtractText().Trim();
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, link },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, "crawler")
        .NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsHtmlContent(propertyBag.ContentType)) { return; }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream reader = propertyBag.GetResponseStream())
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    propertyBag.Text = htmlDoc.ExtractText().Trim();
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, link },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;
    string temp = propertyBag.ResponseUri.OriginalString;
    string temp2 = CrawlReviewsOnUserProfile.baseUri.Substring(0, 54 + User.userIdString.Length);

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            try
            {
                var x = node.LastChild.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?format=html&page=" + i + "&per_page=100&shelf=read";
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                }
            }
            catch (Exception) { }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            CrawlReviewsOnUserProfile.count++;
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());
            Review review = CrawlUtil.getReview(context, reviewId);

            //create and process the REVIEW if it doesn't already exist
            if (review == null)
            {
                HtmlNode node;
                review = new Review();
                review.id = reviewId;

                //REVIEW.rating
                node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                if (node != null)
                {
                    string ratingString = node.GetAttributeValue("alt", "0");
                    short rating = short.Parse(ratingString.Substring(0, 1));
                    review.starRating = rating;
                }

                //REVIEW.publishdate
                node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                if (node != null)
                {
                    DateTime date;
                    DateTime.TryParse(node.InnerText, out date);
                    review.publishDate = date;
                }

                //USER
                review.userId = User.id;
                review.userIdString = User.userIdString;

                //BOOK
                node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                string bookUrl = node.GetAttributeValue("href", "");
                int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0
                Book book = CrawlUtil.getBook(context, bookId);
                if (book == null)
                {
                    book = new Book();
                    book.id = bookId;
                    string title = node.GetAttributeValue("title", "");
                    book.title = title;

                    node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                    if (node != null)
                    {
                        book.isbn = node.InnerText.Trim();
                    }

                    //AUTHOR
                    node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                    if (node != null)
                    {
                        string authorUrl = node.GetAttributeValue("href", "");
                        int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0
                        Author author = CrawlUtil.getAuthor(context, authorId);
                        if (author == null)
                        {
                            author = new Author();
                            author.id = authorId;
                            author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                            book.Author = author;
                        }
                    }
                }

                review.Book = book;
                context.SaveChanges();
            }
        }
        CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;
    string temp = propertyBag.ResponseUri.OriginalString;
    string temp2 = CrawlReviewsOnUserProfile_Updated.baseUri.Substring(0, 54 + User.userIdString.Length);

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            HtmlNode maxPageNode;
            int maxPage = 0;
            try
            {
                maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                maxPage = Int32.Parse(maxPageNode.InnerText.Trim());
            }
            catch (Exception)
            {
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                return;
            }

            //get new reviews since last crawl?
            int pagesToCrawl = 0;
            if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
            {
                pagesToCrawl = maxPage - (User.Reviews.Count / 20); //int division results in truncation
                if (pagesToCrawl < 1)
                {
                    return;
                }

                /**** TEMP to get pages 30 and above (for users with more than 600 reviews (after getting up to 600 only on a previous run)) ****/
                //for (int i = 30; i <= maxPage; i++)
                //{
                //    String s = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                //    crawler.AddStep(new Uri(s), 0);
                //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                //}
                //return;

                /*** Old logic pre 2015 11 30 ***
                 * int startPage = (User.Reviews.Count / 20) + 1;
                 * string uri;
                 * for (int i = startPage; i <= maxPage; i++)
                 * {
                 *     uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                 *     crawler.AddStep(new Uri(uri), 0);
                 *
                 *     CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                 * }
                 * return;
                 *
                 *************/
            }
            else //crawl every page
            {
                pagesToCrawl = maxPage;
            }

            string uri;
            for (int i = 2; i <= pagesToCrawl; i++)
            {
                //http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                crawler.AddStep(new Uri(uri), 0);
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
            }

            //continue with crawl on page 1 unless endPage is 0 (i.e. no pages need to be crawled)
            //2015-11-30: getting X latest pages is now redundant since reviews are now always sorted by date added ascending.
            //feature removed for 2015 update 5 crawl
            //get reviews from specified pages; or latest X pages
            //get user's latest X pages of reviews if user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
            //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
            //{
            //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
            //    {
            //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
            //        for (int i = 0; i < numLatestPages; i++)
            //        {
            //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
            //            crawler.AddStep(new Uri(uriLatest), 0);
            //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
            //        }
            //    }
            //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
            //}
            //string u;
            //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
            //{
            //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
            //    crawler.AddStep(new Uri(u), 0);
            //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
            //}
            //if we don't want to include page 1 then don't crawl after adding other pages to crawl
            //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
            //{
            //    return;
            //}
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());
            //Review review = CrawlUtil.createOrGetReview(context, reviewId);
            Review review = CrawlUtil.getReview(context, reviewId);
            if (review == null) //review is new
            {
                review = new Review();
                review.id = reviewId;
                context.Reviews.AddObject(review);
            }
            else //review already exists
            {
                continue;
            }

            HtmlNode node;

            //REVIEW.rating
            /*<td class="field rating">
             *   <label>Reb's rating</label>
             *   <div class="value">
             *     <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
             *   </div>
             * </td>*/
            node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
            if (node != null)
            {
                string ratingClassString = node.GetAttributeValue("class", "0");
                short rating = CrawlUtil.getRatingFromClassString(ratingClassString);
                review.starRating = rating;
            }

            //REVIEW.publishdate
            node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
            if (node != null)
            {
                DateTime date;
                DateTime.TryParse(node.InnerText, out date);
                review.publishDate = date;
            }

            //USER
            review.userId = User.id;
            review.userIdString = User.userIdString;

            //BOOK
            node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
            string bookUrl = node.GetAttributeValue("href", "");
            int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0
            Book book = CrawlUtil.getBook(context, bookId);
            if (book == null)
            {
                book = new Book();
                book.id = bookId;
                string title = node.GetAttributeValue("title", "");
                book.title = title;

                node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                if (node != null)
                {
                    book.isbn = node.InnerText.Trim();
                }

                //AUTHOR
                node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                if (node != null)
                {
                    string authorUrl = node.GetAttributeValue("href", "");
                    int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0
                    Author author = CrawlUtil.getAuthor(context, authorId);
                    if (author == null)
                    {
                        author = new Author();
                        author.id = authorId;
                        author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                        book.Author = author;
                    }
                }
            }

            review.Book = book;
            context.SaveChanges();
            CrawlReviewsOnUserProfile_Updated.count++;
        }
        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlAgilityPack.HtmlDocument doc = propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument;
    if (doc == null)
    {
        return;
    }

    var reviews = doc.DocumentNode.SelectNodes("//h2");
    if (reviews == null)
    {
        return;
    }

    //On first page for each gender-age-group queue up other pages for the same group
    //First page is where ?start=<blank>
    if (CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, "start") == null)
    {
        var matchingReviewsNode = doc.DocumentNode.SelectSingleNode("//td[@align='right']");
        if (matchingReviewsNode != null)
        {
            //89 matching reviews (334 reviews in total)
            Regex r = new Regex(@"(\d+) (matching reviews|reviews in total)");
            Match m = r.Match(matchingReviewsNode.InnerText);
            if (m != null)
            {
                int matchingReviews = 0;
                int.TryParse(m.Groups[1].Value, out matchingReviews);
                if (matchingReviews > 10)
                {
                    for (int i = 10; i < matchingReviews; i += 10)
                    {
                        Uri add = new Uri(propertyBag.ResponseUri.AbsoluteUri + "&start=" + i);
                        crawler.AddStep(add, 0);
                        ReviewCrawler.MainForm.appendLineToLog(add.AbsoluteUri);
                    }
                }
            }
        }
    }

    foreach (var r in reviews)
    {
        //title (may be null?)
        var review_title = r.InnerText;
        if (!string.IsNullOrWhiteSpace(review_title))
        {
            review_title = HttpUtility.HtmlDecode(review_title);
        }
        else
        {
            review_title = "";
        }
        //Console.WriteLine("TITLE : " + HttpUtility.HtmlDecode(title));
        ReviewCrawler.MainForm.appendLineToLog("TITLE : " + review_title);

        //rating img (can definitely be null)
        string ratingString = "";
        var ratingNode = r.ParentNode.SelectSingleNode("./img");
        if (ratingNode != null)
        {
            ratingString = HttpUtility.HtmlDecode(ratingNode.GetAttributeValue("alt", ""));
            ratingString += " stars";
        }
        //Console.WriteLine("RATING (text) : " + HttpUtility.HtmlDecode(ratingString));
        ReviewCrawler.MainForm.appendLineToLog("RATING : " + ratingString);

        //author name and url (may be null?)
        string authorName = "";
        string authorUrl = "";
        var authorNode = r.ParentNode.SelectSingleNode("./a[2]");
        if (authorNode != null)
        {
            authorName = HttpUtility.HtmlDecode(authorNode.InnerText);
            authorUrl = HttpUtility.HtmlDecode(authorNode.GetAttributeValue("href", ""));
        }
        //Console.WriteLine("AUTHOR NAME : " + HttpUtility.HtmlDecode(authorName));
        //Console.WriteLine("AUTHOR URL : " + HttpUtility.HtmlDecode(authorUrl));
        ReviewCrawler.MainForm.appendLineToLog("AUTHOR NAME : " + authorName);
        ReviewCrawler.MainForm.appendLineToLog("AUTHOR URL : " + authorUrl);

        //review date (may be null)
        //location (may be null)
        string dateString = "";
        string locationString = "";
        var dateNode = r.ParentNode.SelectSingleNode("./small[3]");
        var location = r.ParentNode.SelectSingleNode("./small[2]");
        if (dateNode == null) //this happens if the author does not have a location
        {
            dateNode = r.ParentNode.SelectSingleNode("./small[2]");
            location = null;
        }
        if (dateNode != null)
        {
            DateTime date;
            try
            {
                DateTime.TryParse(HttpUtility.HtmlDecode(dateNode.InnerText), out date);
                dateString = date.ToShortDateString();
            }
            catch (Exception) { /* ignore :( */ }
        }
        if (location != null)
        {
            locationString = HttpUtility.HtmlDecode(location.InnerText);
        }
        //Console.WriteLine("DATE : " + HttpUtility.HtmlDecode(dateString));
        //Console.WriteLine("LOCATION : " + HttpUtility.HtmlDecode(locationString));
        ReviewCrawler.MainForm.appendLineToLog("DATE : " + dateString);
        ReviewCrawler.MainForm.appendLineToLog("LOCATION : " + locationString);

        //usefulness (may be null)
        string usefulness = "";
        var usefulnessNode = r.ParentNode.SelectSingleNode("./small");
        if (usefulnessNode != null && usefulnessNode.InnerText.EndsWith("following review useful:"))
        {
            usefulness = HttpUtility.HtmlDecode(usefulnessNode.InnerText);
        }
        //Console.WriteLine("USEFULNESS : " + HttpUtility.HtmlDecode(usefulness));
        ReviewCrawler.MainForm.appendLineToLog("USEFULNESS : " + usefulness);

        //Review text
        var reviewText = r.ParentNode.NextSibling.NextSibling.InnerText;
        if (!String.IsNullOrWhiteSpace(reviewText))
        {
            //Console.WriteLine("REVIEW TEXT : " + HttpUtility.HtmlDecode(reviewText.Replace("\n", " ")).Substring(0, 200) + " ...");
            reviewText = HttpUtility.HtmlDecode(reviewText.Replace("\n", " ").Replace("\r", " ").Replace("\t", " "));
            ReviewCrawler.MainForm.appendLineToLog("REVIEW TEXT : " + reviewText.Substring(0, reviewText.Length / 10) + " ...");
        }
        else
        {
            reviewText = "";
        }

        string gender = "gender";
        string age_min = "age_min";
        string age_max = "age_max";
        gender = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, gender);
        age_min = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, age_min);
        age_max = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, age_max);
        ReviewCrawler.MainForm.appendLineToLog("GENDER : " + gender);
        ReviewCrawler.MainForm.appendLineToLog("AGE MIN : " + age_min);
        ReviewCrawler.MainForm.appendLineToLog("AGE MAX : " + age_max);

        string movie_title = CrawlUtil.getMovieNameFromTitle(HttpUtility.HtmlDecode(propertyBag.Title));

        var tsv = new StringBuilder();
        tsv.AppendFormat("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}" + Environment.NewLine,
            movie_title,    //0
            review_title,   //1
            ratingString,   //2
            dateString,     //3
            authorName,     //4
            authorUrl,      //5
            locationString, //6
            usefulness,     //7
            reviewText,     //8
            gender,         //9
            age_min,        //10
            age_max         //11
        );

        try
        {
            File.AppendAllText(ReviewCrawler.MainForm.SaveFileName, tsv.ToString());
        }
        catch (Exception ex)
        {
            ReviewCrawler.MainForm.appendLineToLog(ex.Message);
            ReviewCrawler.MainForm.appendLineToLog(ex.StackTrace);
        }
    }
}