public void Process(Crawler crawler, PropertyBag propertyBag)
{
    foreach (Uri uri in seeds)
    {
        crawler.AddStep(uri, 2);
    }
}
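// Usage sketch (not part of the original source): the Process methods in this file are NCrawler
// pipeline steps. Assuming they live in classes implementing NCrawler's IPipelineStep, wiring a
// seed-injection step like the one above into a crawl looks roughly like this. The seed URL, the
// class names SeedingStep/CrawlerHost and the limits are illustrative assumptions; namespaces
// follow NCrawler 3.x as best I recall and may need adjusting.
using System;
using NCrawler;
using NCrawler.HtmlProcessor;
using NCrawler.Interfaces;

public class SeedingStep : IPipelineStep
{
    private readonly Uri[] seeds =
    {
        new Uri("http://example.com/sitemap.xml")   // placeholder seed
    };

    // Same shape as the step above: push every seed into the crawl queue at depth 2.
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        foreach (Uri uri in seeds)
        {
            crawler.AddStep(uri, 2);
        }
    }
}

public static class CrawlerHost
{
    public static void Main()
    {
        // Pipeline steps run in order for every downloaded document.
        using (Crawler crawler = new Crawler(new Uri("http://example.com/"),
            new HtmlDocumentProcessor(),   // parses HTML and queues discovered links
            new SeedingStep()))
        {
            crawler.MaximumThreadCount = 2;
            crawler.MaximumCrawlDepth = 3;
            crawler.Crawl();
        }
    }
}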
protected virtual void AddStepToCrawler(Crawler crawler, PropertyBag propertyBag, string normalizedLink, string link)
{
    crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
        propertyBag.Step,
        new Dictionary<string, object>
        {
            { Resources.PropertyBagKeyOriginalUrl, link },
            { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
        });
}
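// Illustration only: AddStepToCrawler is protected virtual, so a derived processor can filter or
// rewrite links before they are queued. DocumentProcessorBase and SameHostProcessor are
// hypothetical names standing in for whatever class declares the method above; Crawler and
// PropertyBag come from NCrawler.
using System;
using NCrawler;

public class SameHostProcessor : DocumentProcessorBase   // hypothetical base class
{
    protected override void AddStepToCrawler(Crawler crawler, PropertyBag propertyBag, string normalizedLink, string link)
    {
        // Only follow links that stay on the host of the page that referenced them.
        Uri target;
        if (Uri.TryCreate(normalizedLink, UriKind.Absolute, out target) &&
            string.Equals(target.Host, propertyBag.ResponseUri.Host, StringComparison.OrdinalIgnoreCase))
        {
            base.AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
        }
    }
}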
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsXmlContent(propertyBag.ContentType)) { return; }

    using (Stream reader = propertyBag.GetResponse())
    using (StreamReader sr = new StreamReader(reader))
    {
        XDocument mydoc = XDocument.Load(sr);
        if (mydoc.Root == null) { return; }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            select e.Value;

        foreach (string url in urlNodes)
        {
            // add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty()) { continue; }

            crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                propertyBag.Step,
                new Dictionary<string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, url },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                });
        }
    }
}
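// Reference sketch (not part of the original step): a minimal sitemap in the sitemaps.org 0.9
// namespace that the query above matches, with the same Descendants("loc") selection run
// standalone. The URLs are placeholders.
using System;
using System.Linq;
using System.Xml.Linq;

public static class SitemapLocDemo
{
    public static void Main()
    {
        const string xml = @"<urlset xmlns=""http://www.sitemaps.org/schemas/sitemap/0.9"">
  <url><loc>http://example.com/</loc></url>
  <url><loc>http://example.com/page-1</loc></url>
</urlset>";

        XDocument doc = XDocument.Parse(xml);
        XName loc = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

        // Mirrors the LINQ query in the step above: keep only absolute http:// locations.
        var urls = doc.Descendants(loc)
                      .Select(e => e.Value)
                      .Where(v => v.StartsWith("http://", StringComparison.OrdinalIgnoreCase));

        foreach (string url in urls)
        {
            Console.WriteLine(url);   // prints the two example locations
        }
    }
}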
public virtual void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    string text = propertyBag.Text;
    if (HasTextStripRules) { text = StripText(text); }
    if (text.IsNullOrEmpty()) { return; }
    if (HasLinkStripRules) { text = StripLinks(text); }

    // Find links
    MatchCollection matches = s_LinkRegex.Value.Matches(text);
    foreach (Match match in matches.Cast<Match>().Where(m => m.Success))
    {
        string link = match.Value;
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalizedLink = link.NormalizeUrl(baseUrl);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        // Add new step to crawler
        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public virtual void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    string text = propertyBag.Text;
    if (HasTextStripRules) { text = StripText(text); }
    if (text.IsNullOrEmpty()) { return; }
    if (HasLinkStripRules) { text = StripLinks(text); }

    // Find links
    MatchCollection matches = s_LinkRegex.Value.Matches(text);
    foreach (Match match in matches.Cast<Match>().Where(m => m.Success))
    {
        string link = match.Value;
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalizedLink = link.NormalizeUri(baseUrl);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        // Add new step to crawler
        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    int maxPage = CrawlUtil.getMaxReviewIFramePageNumber(htmlDoc);

    //add other review pages if at base (uri) page
    if (propertyBag.ResponseUri.ToString() == CrawlReviewIFrame.baseUri && maxPage != -1)
    {
        int maxPageToCrawl = maxPage;
        string uri = "";
        //if (maxPage > 10) commenting this out means to crawl all review pages.
        //{
        //    maxPageToCrawl = 10;
        //}
        for (int i = 2; i <= maxPageToCrawl; i++)
        {
            uri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
                + CrawlReviewIFrame.currentBook.isbn + "&links=660&min_rating=&page=" + i
                + "&review_back=fff&stars=000&text=000";
            crawler.AddStep(new Uri(uri), 0);
        }
        CrawlReviewIFrame.form.appendLineToLog("Crawling " + maxPageToCrawl + " pages of reviews for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }

    //only process review iframe pages
    if (!propertyBag.ResponseUri.OriginalString.StartsWith(CrawlReviewIFrame.baseUri.Substring(0, 100)))
    {
        return;
    }

    lock (this)
    {
        string currentPage = "0";
        var currentPageNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='current']");
        if (currentPageNode != null)
        {
            currentPage = currentPageNode.InnerText.Trim();
        }

        var reviews = htmlDoc.DocumentNode.SelectNodes("//*[@itemtype='http://schema.org/Review']");
        if (reviews == null || reviews.Count == 0)
        {
            return;
        }

        //**do stuff to handle dupes properly
        // -_-
        //current method just saves each review one by one and ignores all errors when trying to save.
        //this also means all reviews are attempted to be saved again no matter what :(
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();

        foreach (var r in reviews)
        {
            string reviewUrl;
            int reviewId = -1;
            Match match;
            var reviewLinkNode = r.SelectSingleNode(".//div[@class='gr_review_text']/link[@itemprop='url']");
            DateTime publishDate = DateTime.MinValue;
            short starRating = 0;
            Review toAdd = new Review();

            if (reviewLinkNode != null)
            {
                reviewUrl = reviewLinkNode.GetAttributeValue("href", "null");
                match = regReview.Match(reviewUrl);
                if (Int32.TryParse(match.Groups[1].Value, out reviewId))
                {
                    if (CrawlReviewIFrame.existingReviewIds.Contains(reviewId))
                    {
                        continue;
                    }

                    var node = r.SelectSingleNode(".//span[@class='gr_review_date']");
                    if (node != null)
                    {
                        DateTime.TryParse(node.InnerText, out publishDate);
                    }

                    node = r.SelectSingleNode(".//span[@class='gr_rating']");
                    if (node != null)
                    {
                        starRating = CrawlUtil.countStarsFromString(node.InnerText);
                    }

                    toAdd.id = reviewId;
                    toAdd.bookId = CrawlReviewIFrame.currentBook.id;
                    toAdd.publishDate = publishDate;
                    toAdd.starRating = starRating;
                    toAdd.foundOnPage = Int32.Parse(currentPage);
                    toAdd.maxPage = maxPage;
                    context.Reviews.AddObject(toAdd);
                }

                try
                {
                    context.SaveChanges();
                }
                catch (Exception ex)
                {
                    context.Reviews.Detach(toAdd);
                    CrawlReviewIFrame.form.appendLineToLog(ex.Message);
                    if (ex.InnerException != null)
                    {
                        CrawlReviewIFrame.form.appendLineToLog("\t" + ex.InnerException.Message);
                    }
                }
            }
        }

        CrawlReviewIFrame.form.appendLineToLog("Added " + reviews.Count + " on page " + currentPage + " of " + maxPage + " for " + CrawlReviewIFrame.currentBook.getShortTitle());
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    //add pages only if at base uri page
    if (propertyBag.ResponseUri.ToString() == "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + CrawlList.FromPage)
    {
        string uri = "";
        for (int i = CrawlList.FromPage + 1; i <= CrawlList.ToPage; i++)
        {
            uri = "http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + i;
            crawler.AddStep(new Uri(uri), 0);
            CrawlList.form.appendLineToLog("also crawling " + uri);
        }
    }

    //only process list pages
    if (!propertyBag.ResponseUri.OriginalString.StartsWith("http://www.goodreads.com/list/show/1.Best_Books_Ever"))
    {
        return;
    }

    var s = propertyBag["HtmlDoc"].Value;
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc != null)
    {
        lock (this)
        {
            var books = htmlDoc.DocumentNode.SelectNodes("//tr[@itemtype='http://schema.org/Book']");
            if (books == null || books.Count == 0)
            {
                return;
            }

            GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
            foreach (var b in books)
            {
                string title = "null";
                string authorName = "null";
                var titleURLNode = b.SelectSingleNode(".//*[@class='bookTitle']");
                var authorURLNode = b.SelectSingleNode(".//*[@class='authorName']");
                string titleUrl = "null";
                string authorUrl = "null";
                Match match;
                string bookId = "-1";
                string authorId = "-1";
                Book newBook = null;
                Author author = null;

                if (titleURLNode != null && authorURLNode != null)
                {
                    titleUrl = titleURLNode.GetAttributeValue("href", "null");
                    match = regBook.Match(titleUrl);
                    bookId = match.Groups[1].Value;
                    title = titleURLNode.InnerText.Trim();

                    authorUrl = authorURLNode.GetAttributeValue("href", "null");
                    match = regAuthor.Match(authorUrl);
                    authorId = match.Groups[1].Value;
                    authorName = authorURLNode.InnerText.Trim();

                    author = CrawlUtil.createOrGetAuthor(context, Int32.Parse(authorId), authorName);
                    newBook = CrawlUtil.createOrGetBook(context, Int32.Parse(bookId), title);
                    newBook.Author = author;
                    //author.Book = newBook;
                }
                context.SaveChanges();
            }
            CrawlList.form.appendLineToLog("added/updated " + books.Count + " books and their authors");
        }
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlListAndVotes.baseUri)
    {
        /*
         * <div>
         *   <span class="previous_page disabled">« previous</span>
         *   <em class="current">1</em>
         *   <a rel="next" href="/list/user_votes/1045275-natasha?page=2">2</a>
         *   <a href="/list/user_votes/1045275-natasha?page=3">3</a>
         *   <a href="/list/user_votes/1045275-natasha?page=4">4</a>
         *   <a href="/list/user_votes/1045275-natasha?page=5">5</a>
         *   <a class="next_page" rel="next" href="/list/user_votes/1045275-natasha?page=2">next »</a>
         * </div>
         */
        var node = doc.SelectSingleNode(".//a[@class='next_page' and @rel='next']");
        if (node != null)
        {
            try
            {
                var x = node.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/list/user_votes/" + User.userIdString + "?page=" + i;
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlListAndVotes.form.appendLineToLog(uri);
                }
            }
            catch (Exception ex)
            {
                CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var listNode in doc.SelectNodes(".//div[@class='cell']"))
        {
            List l = null;
            string title = null;
            var titleNode = listNode.SelectSingleNode(".//a[@class='listTitle']");
            if (titleNode != null)
            {
                title = titleNode.InnerText.Trim();
            }

            if (title != null)
            {
                l = CrawlUtil.createOrGetList(context, title);
            }
            else
            {
                continue;
            }

            /*
             * 296 books
             * —
             * 994 voters
             */
            var statsNode = listNode.SelectSingleNode(".//div[@class='listFullDetails']");
            if (statsNode != null)
            {
                string s = statsNode.InnerText.Replace("\n", "").Trim();
                l.numBooks = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
                s = s.Substring(s.IndexOf("books"));
                l.numVoters = Convert.ToInt32(CrawlUtil.extractNumberFromString(s));
            }

            User u = CrawlUtil.getUser(context, User.id);
            u.Lists.Add(l);
            try
            {
                context.SaveChanges();
                CrawlListAndVotes.count++;
            }
            catch (Exception ex)
            {
                User.Lists.Remove(l);
                //this just prints out to check an inner exception which is a dupe PK error
                //CrawlListAndVotes.form.appendLineToLog(ex.Message);
            }
        }
        CrawlListAndVotes.form.appendLineToLog(User.userIdString + ":: " + CrawlListAndVotes.count + " lists added");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, "crawler")
        .NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsHtmlContent(propertyBag.ContentType)) { return; }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (Stream reader = propertyBag.GetResponse())
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    propertyBag.Text = htmlDoc.ExtractText().Trim();
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, link },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, "crawler")
        .NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK) { return; }
    if (!IsHtmlContent(propertyBag.ContentType)) { return; }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream reader = propertyBag.GetResponseStream())
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    propertyBag.Text = htmlDoc.ExtractText().Trim();
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty()) { continue; }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty()) { continue; }

        crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, link },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;
    string temp = propertyBag.ResponseUri.OriginalString;
    string temp2 = CrawlReviewsOnUserProfile.baseUri.Substring(0, 54 + User.userIdString.Length);

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            try
            {
                var x = node.LastChild.PreviousSibling.PreviousSibling;
                int maxPage = Int32.Parse(x.InnerText.Trim());
                string uri;
                for (int i = 2; i <= maxPage; i++)
                {
                    uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?format=html&page=" + i + "&per_page=100&shelf=read";
                    crawler.AddStep(new Uri(uri), 0);
                    CrawlReviewsOnUserProfile.form.appendLineToLog(uri);
                }
            }
            catch (Exception) { }
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            CrawlReviewsOnUserProfile.count++;
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());
            Review review = CrawlUtil.getReview(context, reviewId);

            //create and process the REVIEW if it doesn't already exist
            if (review == null)
            {
                HtmlNode node;
                review = new Review();
                review.id = reviewId;

                //REVIEW.rating
                node = reviewNode.SelectSingleNode(".//td[@class='field rating']//img");
                if (node != null)
                {
                    string ratingString = node.GetAttributeValue("alt", "0");
                    short rating = short.Parse(ratingString.Substring(0, 1));
                    review.starRating = rating;
                }

                //REVIEW.publishdate
                node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
                if (node != null)
                {
                    DateTime date;
                    DateTime.TryParse(node.InnerText, out date);
                    review.publishDate = date;
                }

                //USER
                review.userId = User.id;
                review.userIdString = User.userIdString;

                //BOOK
                node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
                string bookUrl = node.GetAttributeValue("href", "");
                int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0
                Book book = CrawlUtil.getBook(context, bookId);
                if (book == null)
                {
                    book = new Book();
                    book.id = bookId;
                    string title = node.GetAttributeValue("title", "");
                    book.title = title;

                    node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                    if (node != null)
                    {
                        book.isbn = node.InnerText.Trim();
                    }

                    //AUTHOR
                    node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                    if (node != null)
                    {
                        string authorUrl = node.GetAttributeValue("href", "");
                        int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0
                        Author author = CrawlUtil.getAuthor(context, authorId);
                        if (author == null)
                        {
                            author = new Author();
                            author.id = authorId;
                            author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                            book.Author = author;
                        }
                    }
                }

                review.Book = book;
                context.SaveChanges();
            }
        }
        CrawlReviewsOnUserProfile.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile.count + " reviews crawled");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlDocument htmlDoc = propertyBag["HtmlDoc"].Value as HtmlDocument;
    if (htmlDoc == null)
    {
        return; //this happens with HTTP errors etc. We don't bother with retrying or anything like that :(
    }

    HtmlNode doc = htmlDoc.DocumentNode;
    string temp = propertyBag.ResponseUri.OriginalString;
    string temp2 = CrawlReviewsOnUserProfile_Updated.baseUri.Substring(0, 54 + User.userIdString.Length);

    //on page 1, add other pages for user
    if (propertyBag.ResponseUri.OriginalString == CrawlReviewsOnUserProfile_Updated.baseUri)
    {
        var node = doc.SelectSingleNode(".//div[@id='reviewPagination']");
        if (node != null)
        {
            HtmlNode maxPageNode;
            int maxPage = 0;
            try
            {
                maxPageNode = node.LastChild.PreviousSibling.PreviousSibling;
                maxPage = Int32.Parse(maxPageNode.InnerText.Trim());
            }
            catch (Exception)
            {
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog("Error getting maxPage on " + propertyBag.ResponseUri.OriginalString);
                return;
            }

            //get new reviews since last crawl?
            int pagesToCrawl = 0;
            if (CrawlReviewsOnUserProfile_Updated.getReviewsSinceLastCrawl)
            {
                pagesToCrawl = maxPage - (User.Reviews.Count / 20); //int division results in truncation
                if (pagesToCrawl < 1)
                {
                    return;
                }

                /**** TEMP to get pages 30 and above (for users with more than 600 reviews (after getting up to 600 only on a previous run)) ****/
                //for (int i = 30; i <= maxPage; i++)
                //{
                //    String s = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                //    crawler.AddStep(new Uri(s), 0);
                //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(s);
                //}
                //return;

                /*** Old logic pre 2015 11 30 ***
                 * int startPage = (User.Reviews.Count / 20) + 1;
                 * string uri;
                 * for (int i = startPage; i <= maxPage; i++)
                 * {
                 *     uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
                 *     crawler.AddStep(new Uri(uri), 0);
                 *
                 *     CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
                 * }
                 * return;
                 *
                 *************/
            }
            else //crawl every page
            {
                pagesToCrawl = maxPage;
            }

            string uri;
            for (int i = 2; i <= pagesToCrawl; i++)
            {
                //http://www.goodreads.com/review/list/1-otis-chandler?page=3&print=true&shelf=read
                uri = "http://www.goodreads.com/review/list/" + User.userIdString + "?page=" + i + "&print=true&shelf=read";
                crawler.AddStep(new Uri(uri), 0);
                CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uri);
            }

            //continue with crawl on page 1 unless endPage is 0 (i.e. no pages need to be crawled)
            //2015-11-30: getting X latest pages is now redundant since reviews are now always sorted by date added ascending.
            //feature removed for 2015 update 5 crawl
            //get reviews from specified pages; or latest X pages
            //get user's latest X pages of reviews if user has more than (maxPage * 20) reviews; X determined by getHowManyLatestPages
            //if (maxPage > CrawlReviewsOnUserProfile_Updated.maxPage)
            //{
            //    if (CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages > 0)
            //    {
            //        int numLatestPages = Math.Min(CrawlReviewsOnUserProfile_Updated.getHowManyLatestPages, maxPage - CrawlReviewsOnUserProfile_Updated.maxPage);
            //        for (int i = 0; i < numLatestPages; i++)
            //        {
            //            string uriLatest = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + (maxPage - i) + "&print=true&shelf=read&sort=date_added&view=reviews";
            //            crawler.AddStep(new Uri(uriLatest), 0);
            //            CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(uriLatest);
            //        }
            //    }
            //    maxPage = CrawlReviewsOnUserProfile_Updated.maxPage;
            //}
            //string u;
            //for (int i = CrawlReviewsOnUserProfile_Updated.minPage; i <= maxPage; i++)
            //{
            //    u = "http://www.goodreads.com/review/list/" + User.userIdString + "?order=a&page=" + i + "&print=true&shelf=read&sort=date_added&view=reviews";
            //    crawler.AddStep(new Uri(u), 0);
            //    CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(u);
            //}
            //if we don't want to include page 1 then don't crawl after adding other pages to crawl
            //if (CrawlReviewsOnUserProfile_Updated.minPage > 1)
            //{
            //    return;
            //}
        }
    }

    lock (this)
    {
        GoodReadsCrawlerEntities context = CrawlUtil.getNewContext();
        foreach (var reviewNode in doc.SelectNodes(".//tr[@class='bookalike review']"))
        {
            string reviewIdString = reviewNode.GetAttributeValue("id", "");
            if (reviewIdString == "")
            {
                return;
            }

            int reviewId = Int32.Parse(reviewIdString.Split('_').GetValue(1).ToString());
            //Review review = CrawlUtil.createOrGetReview(context, reviewId);
            Review review = CrawlUtil.getReview(context, reviewId);
            if (review == null) //review is new
            {
                review = new Review();
                review.id = reviewId;
                context.Reviews.AddObject(review);
            }
            else //review already exists
            {
                continue;
            }

            HtmlNode node;

            //REVIEW.rating
            /*<td class="field rating">
             *   <label>Reb's rating</label>
             *   <div class="value">
             *     <a class=" staticStars stars_4" title="really liked it">4 of 5 stars</a>
             *   </div>
             * </td>*/
            node = reviewNode.SelectSingleNode(".//td[@class='field rating']//a");
            if (node != null)
            {
                string ratingClassString = node.GetAttributeValue("class", "0");
                short rating = CrawlUtil.getRatingFromClassString(ratingClassString);
                review.starRating = rating;
            }

            //REVIEW.publishdate
            node = reviewNode.SelectSingleNode(".//td[@class='field date_added']//span");
            if (node != null)
            {
                DateTime date;
                DateTime.TryParse(node.InnerText, out date);
                review.publishDate = date;
            }

            //USER
            review.userId = User.id;
            review.userIdString = User.userIdString;

            //BOOK
            node = reviewNode.SelectSingleNode(".//td[@class='field title']//a");
            string bookUrl = node.GetAttributeValue("href", "");
            int bookId = CrawlUtil.extractIdNumberFromUrl(bookUrl); //if bookUrl is null then bookId gets set to 0
            Book book = CrawlUtil.getBook(context, bookId);
            if (book == null)
            {
                book = new Book();
                book.id = bookId;
                string title = node.GetAttributeValue("title", "");
                book.title = title;

                node = reviewNode.SelectSingleNode(".//td[@class='field isbn']//div");
                if (node != null)
                {
                    book.isbn = node.InnerText.Trim();
                }

                //AUTHOR
                node = reviewNode.SelectSingleNode(".//td[@class='field author']//a");
                if (node != null)
                {
                    string authorUrl = node.GetAttributeValue("href", "");
                    int authorId = CrawlUtil.extractIdNumberFromUrl(authorUrl); //if authorUrl is null then authorId gets set to 0
                    Author author = CrawlUtil.getAuthor(context, authorId);
                    if (author == null)
                    {
                        author = new Author();
                        author.id = authorId;
                        author.name = CrawlUtil.formatAuthorName(node.InnerText.Trim());
                        book.Author = author;
                    }
                }
            }

            review.Book = book;
            context.SaveChanges();
            CrawlReviewsOnUserProfile_Updated.count++;
        }
        CrawlReviewsOnUserProfile_Updated.form.appendLineToLog(User.userIdString + ":: " + CrawlReviewsOnUserProfile_Updated.count + " reviews crawled");
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HtmlAgilityPack.HtmlDocument doc = propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument;
    if (doc == null)
    {
        return;
    }

    var reviews = doc.DocumentNode.SelectNodes("//h2");
    if (reviews == null)
    {
        return;
    }

    //On first page for each gender-age-group queue up other pages for the same group
    //First page is where ?start=<blank>
    if (CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, "start") == null)
    {
        var matchingReviewsNode = doc.DocumentNode.SelectSingleNode("//td[@align='right']");
        if (matchingReviewsNode != null)
        {
            //89 matching reviews (334 reviews in total)
            Regex r = new Regex(@"(\d+) (matching reviews|reviews in total)");
            Match m = r.Match(matchingReviewsNode.InnerText);
            if (m != null)
            {
                int matchingReviews = 0;
                int.TryParse(m.Groups[1].Value, out matchingReviews);
                if (matchingReviews > 10)
                {
                    for (int i = 10; i < matchingReviews; i += 10)
                    {
                        Uri add = new Uri(propertyBag.ResponseUri.AbsoluteUri + "&start=" + i);
                        crawler.AddStep(add, 0);
                        ReviewCrawler.MainForm.appendLineToLog(add.AbsoluteUri);
                    }
                }
            }
        }
    }

    foreach (var r in reviews)
    {
        //title (may be null?)
        var review_title = r.InnerText;
        if (!string.IsNullOrWhiteSpace(review_title))
        {
            review_title = HttpUtility.HtmlDecode(review_title);
        }
        else
        {
            review_title = "";
        }
        //Console.WriteLine("TITLE : " + HttpUtility.HtmlDecode(title));
        ReviewCrawler.MainForm.appendLineToLog("TITLE : " + review_title);

        //rating img (can definitely be null)
        string ratingString = "";
        var ratingNode = r.ParentNode.SelectSingleNode("./img");
        if (ratingNode != null)
        {
            ratingString = HttpUtility.HtmlDecode(ratingNode.GetAttributeValue("alt", ""));
            ratingString += " stars";
        }
        //Console.WriteLine("RATING (text) : " + HttpUtility.HtmlDecode(ratingString));
        ReviewCrawler.MainForm.appendLineToLog("RATING : " + ratingString);

        //author name and url (may be null?)
        string authorName = "";
        string authorUrl = "";
        var authorNode = r.ParentNode.SelectSingleNode("./a[2]");
        if (authorNode != null)
        {
            authorName = HttpUtility.HtmlDecode(authorNode.InnerText);
            authorUrl = HttpUtility.HtmlDecode(authorNode.GetAttributeValue("href", ""));
        }
        //Console.WriteLine("AUTHOR NAME : " + HttpUtility.HtmlDecode(authorName));
        //Console.WriteLine("AUTHOR URL : " + HttpUtility.HtmlDecode(authorUrl));
        ReviewCrawler.MainForm.appendLineToLog("AUTHOR NAME : " + authorName);
        ReviewCrawler.MainForm.appendLineToLog("AUTHOR URL : " + authorUrl);

        //review date (may be null)
        //location (may be null)
        string dateString = "";
        string locationString = "";
        var dateNode = r.ParentNode.SelectSingleNode("./small[3]");
        var location = r.ParentNode.SelectSingleNode("./small[2]");
        if (dateNode == null) //this happens if the author does not have a location
        {
            dateNode = r.ParentNode.SelectSingleNode("./small[2]");
            location = null;
        }
        if (dateNode != null)
        {
            DateTime date;
            try
            {
                DateTime.TryParse(HttpUtility.HtmlDecode(dateNode.InnerText), out date);
                dateString = date.ToShortDateString();
            }
            catch (Exception) { /* ignore :( */ }
        }
        if (location != null)
        {
            locationString = HttpUtility.HtmlDecode(location.InnerText);
        }
        //Console.WriteLine("DATE : " + HttpUtility.HtmlDecode(dateString));
        //Console.WriteLine("LOCATION : " + HttpUtility.HtmlDecode(locationString));
        ReviewCrawler.MainForm.appendLineToLog("DATE : " + dateString);
        ReviewCrawler.MainForm.appendLineToLog("LOCATION : " + locationString);

        //usefulness (may be null)
        string usefulness = "";
        var usefulnessNode = r.ParentNode.SelectSingleNode("./small");
        if (usefulnessNode != null && usefulnessNode.InnerText.EndsWith("following review useful:"))
        {
            usefulness = HttpUtility.HtmlDecode(usefulnessNode.InnerText);
        }
        //Console.WriteLine("USEFULNESS : " + HttpUtility.HtmlDecode(usefulness));
        ReviewCrawler.MainForm.appendLineToLog("USEFULNESS : " + usefulness);

        //Review text
        var reviewText = r.ParentNode.NextSibling.NextSibling.InnerText;
        if (!String.IsNullOrWhiteSpace(reviewText))
        {
            //Console.WriteLine("REVIEW TEXT : " + HttpUtility.HtmlDecode(reviewText.Replace("\n", " ")).Substring(0, 200) + " ...");
            reviewText = HttpUtility.HtmlDecode(reviewText.Replace("\n", " ").Replace("\r", " ").Replace("\t", " "));
            ReviewCrawler.MainForm.appendLineToLog("REVIEW TEXT : " + reviewText.Substring(0, reviewText.Length / 10) + " ...");
        }
        else
        {
            reviewText = "";
        }

        string gender = "gender";
        string age_min = "age_min";
        string age_max = "age_max";
        gender = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, gender);
        age_min = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, age_min);
        age_max = CrawlUtil.getQueryValueFromUrl(propertyBag.ResponseUri.AbsoluteUri, age_max);
        ReviewCrawler.MainForm.appendLineToLog("GENDER : " + gender);
        ReviewCrawler.MainForm.appendLineToLog("AGE MIN : " + age_min);
        ReviewCrawler.MainForm.appendLineToLog("AGE MAX : " + age_max);

        string movie_title = CrawlUtil.getMovieNameFromTitle(HttpUtility.HtmlDecode(propertyBag.Title));

        var tsv = new StringBuilder();
        tsv.AppendFormat("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}" + Environment.NewLine,
            movie_title,    //0
            review_title,   //1
            ratingString,   //2
            dateString,     //3
            authorName,     //4
            authorUrl,      //5
            locationString, //6
            usefulness,     //7
            reviewText,     //8
            gender,         //9
            age_min,        //10
            age_max         //11
        );

        try
        {
            File.AppendAllText(ReviewCrawler.MainForm.SaveFileName, tsv.ToString());
        }
        catch (Exception ex)
        {
            ReviewCrawler.MainForm.appendLineToLog(ex.Message);
            ReviewCrawler.MainForm.appendLineToLog(ex.StackTrace);
        }
    }
}