/// <summary>
/// Crawls the posters for an artist and returns them serialized as a JSON string.
/// </summary>
/// <param name="url">URL forwarded to <c>MovieCrawler.CrawlPosters</c>.</param>
/// <param name="artistName">Artist name forwarded to <c>MovieCrawler.CrawlPosters</c>.</param>
/// <param name="body">Not read by this method.</param>
/// <returns>The result of <c>JsonConvert.SerializeObject</c> applied to the crawled poster list.</returns>
private string GetArtistPosters(string url, string artistName, HtmlNode body)
{
    // CrawlPosters requires a ref argument for the thumbnail path; the value it
    // writes back is not used here.
    string ignoredThumbnail = string.Empty;

    var crawler = new MovieCrawler();
    List<string> posterList = crawler.CrawlPosters(url, artistName, ref ignoredThumbnail);

    string serialized = JsonConvert.SerializeObject(posterList);
    return serialized;
}
/// <summary>
/// Crawls the posters for an artist and returns them serialized as a JSON string.
/// </summary>
/// <param name="url">URL forwarded to <c>MovieCrawler.CrawlPosters</c>.</param>
/// <param name="artistName">Artist name forwarded to <c>MovieCrawler.CrawlPosters</c>.</param>
/// <param name="body">Not read by this method.</param>
/// <returns>The result of <c>JsonConvert.SerializeObject</c> applied to the crawled poster list.</returns>
private string GetArtistPosters(string url, string artistName, HtmlNode body)
{
    // CrawlPosters requires a ref argument for the thumbnail path; the value it
    // writes back is not used here.
    string ignoredThumbnail = string.Empty;

    var crawler = new MovieCrawler();
    List<string> posterList = crawler.CrawlPosters(url, artistName, ref ignoredThumbnail);

    string serialized = JsonConvert.SerializeObject(posterList);
    return serialized;
}
/// <summary>
/// Crawls the single movie named <paramref name="movieName"/> out of the supplied movie XML,
/// then stores the movie, crawls its posters and reviews, and refreshes its Lucene search entry.
/// </summary>
/// <param name="xmlData">
/// XML text whose <c>Movies/Month/Movie</c> nodes describe the movies. Nothing happens when it is null or empty.
/// </param>
/// <param name="movieName">
/// Name matched, ignoring case, against each movie node's <c>name</c> attribute. Nodes that do not
/// match (or have no <c>name</c> attribute) are skipped.
/// </param>
/// <remarks>
/// Errors raised while processing one movie, or while crawling its reviews, are written to
/// <see cref="Debug"/> and processing continues. Any other error (for example XML that cannot be
/// loaded) is logged and rethrown to the caller.
/// </remarks>
private void CrawlfromXML(string xmlData, string movieName)
{
    if (string.IsNullOrEmpty(xmlData))
    {
        return;
    }

    Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
    JavaScriptSerializer json = new JavaScriptSerializer();

    try
    {
        XmlDocument xdoc = new XmlDocument();

        #region Movie Crawler
        xdoc.LoadXml(xmlData);

        var movies = xdoc.SelectNodes("Movies/Month/Movie");
        if (movies == null)
        {
            return;
        }

        foreach (XmlNode movie in movies)
        {
            // The XML lists every movie of the month; only the requested one is crawled.
            // Ordinal, case-insensitive matching avoids culture-dependent lower-casing and
            // tolerates a missing attribute or a null movieName.
            XmlAttribute movieNameAttr = movie.Attributes["name"];
            if (movieNameAttr == null
                || !string.Equals(movieNameAttr.Value, movieName, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlAttribute movieLinkAttr = movie.Attributes["link"];
            if (movieLinkAttr == null || string.IsNullOrEmpty(movieLinkAttr.Value))
            {
                continue;
            }

            string movieLink = movieLinkAttr.Value;

            try
            {
                List<string> critics = new List<string>();

                #region Crawl Movie
                MovieEntity mov = movieCrawler.Crawl(movieLink);
                TableManager tblMgr = new TableManager();

                // Save the crawled content because in case of new movies, it fails
                tblMgr.UpdateMovieById(mov);

                string posterUrl = string.Empty;

                XmlAttribute santaPosterAttr = movie.Attributes["santaposterlink"];
                if (santaPosterAttr != null && !string.IsNullOrEmpty(santaPosterAttr.Value))
                {
                    XMLMovieProperties prop = new XMLMovieProperties();
                    prop.SantaPosterLink = santaPosterAttr.Value;
                    prop.MovieName = mov.UniqueName;

                    CrawlPosters(json.Serialize(prop));
                }

                // Without identifiers neither reviews nor the search entry can be tied to the movie.
                if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId))
                {
                    continue;
                }

                tblMgr.UpdateMovieById(mov);
                #endregion

                #region Crawl Movie Reviews
                try
                {
                    BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                    HindustanTimesReviews ht = new HindustanTimesReviews();
                    FilmfareReviews ff = new FilmfareReviews();
                    CnnIbn cibn = new CnnIbn();
                    BoxOfficeIndia boi = new BoxOfficeIndia();
                    Dna dna = new Dna();
                    FirstPost fp = new FirstPost();
                    IndianExpress ie = new IndianExpress();
                    KomalNahta kn = new KomalNahta();
                    MidDay md = new MidDay();
                    Ndtv ndtv = new Ndtv();
                    Rajasen rs = new Rajasen();
                    Rediff rdf = new Rediff();
                    Telegraph tg = new Telegraph();
                    TheHindu th = new TheHindu();
                    TimesOfIndia toi = new TimesOfIndia();
                    AnupamaChopra ac = new AnupamaChopra();
                    MumbaiMirror mm = new MumbaiMirror();

                    var reviews = movie.SelectNodes("Review");
                    List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                    if (reviews != null)
                    {
                        foreach (XmlNode review in reviews)
                        {
                            XmlAttribute affiliationAttr = review.Attributes["name"];
                            XmlAttribute reviewLinkAttr = review.Attributes["link"];
                            if (affiliationAttr == null || reviewLinkAttr == null)
                            {
                                // A review node without name/link cannot be crawled; skip it
                                // instead of aborting the remaining reviews.
                                continue;
                            }

                            string affiliation = affiliationAttr.Value;
                            string reviewLink = reviewLinkAttr.Value;

                            // Skip reviews already stored for this movie under the same affiliation.
                            if (reviewList != null && reviewList.Exists(r => r.Affiliation == affiliation))
                            {
                                continue;
                            }

                            // NOTE(review): an affiliation with no matching case below keeps this blank
                            // entity and is still saved further down - confirm whether unknown
                            // sources should be skipped instead.
                            ReviewEntity re = new ReviewEntity();

                            switch (affiliation.Trim())
                            {
                                case "BollywoodHungama":
                                case "Bollywood Hungama":
                                    re = bh.Crawl(reviewLink, affiliation);
                                    break;
                                case "Hindustan Times":
                                    re = ht.Crawl(reviewLink, affiliation);
                                    break;
                                case "Filmfare":
                                    re = ff.Crawl(reviewLink, affiliation);
                                    break;
                                case "CNN IBN":
                                case "CNNIBN":
                                    re = cibn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Box Office India":
                                    re = boi.Crawl(reviewLink, affiliation);
                                    break;
                                case "DNA":
                                    re = dna.Crawl(reviewLink, affiliation);
                                    break;
                                case "FirstPost":
                                    re = fp.Crawl(reviewLink, affiliation);
                                    break;
                                case "Indian Express":
                                    re = ie.Crawl(reviewLink, affiliation);
                                    break;
                                case "Komal Nahta's Blog":
                                    re = kn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mid Day":
                                case "MidDay":
                                    re = md.Crawl(reviewLink, affiliation);
                                    break;
                                case "NDTV":
                                    re = ndtv.Crawl(reviewLink, affiliation);
                                    break;
                                case "rajasen.com":
                                    re = rs.Crawl(reviewLink, affiliation);
                                    break;
                                case "Rediff":
                                    re = rdf.Crawl(reviewLink, affiliation);
                                    break;
                                case "Telegraph":
                                    re = tg.Crawl(reviewLink, affiliation);
                                    break;
                                case "The Hindu":
                                    re = th.Crawl(reviewLink, affiliation);
                                    break;
                                case "Times of India":
                                    re = toi.Crawl(reviewLink, affiliation);
                                    break;
                                case "anupamachopra.com":
                                    re = ac.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mumbai Mirror":
                                    re = mm.Crawl(reviewLink, affiliation);
                                    break;
                            }

                            if (re == null)
                            {
                                continue;
                            }

                            critics.Add(re.ReviewerName);

                            // update the IDs - Movie Id, Reviewer Id etc.
                            string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, affiliation);
                            re.ReviewerId = reviewerId;
                            re.MovieId = mov.MovieId;
                            re.OutLink = reviewLink;
                            tblMgr.UpdateReviewById(re);
                        }
                    }
                }
                catch (Exception reviewEx)
                {
                    // Review crawling stays best-effort so the search entry below is still
                    // refreshed, but the failure is no longer swallowed silently.
                    Debug.WriteLine("Error while crawling reviews for movie - " + movieLink);
                    Debug.WriteLine(reviewEx);
                }
                #endregion

                #region Lucene Search Index
                // Deserialize rejects a null input, so a movie without cast/poster data is
                // treated as having none rather than failing the whole index update.
                List<APIRole.UDT.Cast> casts = string.IsNullOrEmpty(mov.Cast)
                    ? null
                    : json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                List<String> posters = string.IsNullOrEmpty(mov.Posters)
                    ? null
                    : json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                List<String> actors = new List<string>();

                if (casts != null)
                {
                    foreach (var actor in casts)
                    {
                        // actor, director, music, producer
                        string role = (actor.role ?? string.Empty).ToLowerInvariant();
                        string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                        // Skip artists already listed for another role, and artists without a usable name.
                        if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
                        {
                            continue;
                        }

                        // Only the main artists are indexed; other roles fall through and are ignored.
                        switch (role)
                        {
                            case "actor":
                                actors.Add(actor.name);
                                break;
                            case "producer":
                                // Producers are sometimes listed as line producer etc.;
                                // only entries whose character name equals the role are kept.
                                if (characterName == role)
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                            case "music":
                            case "director":
                                // Main music/movie directors have no character name, whereas
                                // associate or assistant directors do; the latter are skipped.
                                if (string.IsNullOrEmpty(characterName))
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                        }
                    }
                }

                if (posters != null && posters.Count > 0)
                {
                    // The last poster in the list is used as the title image.
                    posterUrl = posters[posters.Count - 1];
                }

                // include reviewer & their affiliation in index file
                MovieSearchData movieSearchIndex = new MovieSearchData
                {
                    Id = mov.RowKey,
                    Title = mov.Name,
                    Type = mov.Genre,
                    TitleImageURL = posterUrl,
                    UniqueName = mov.UniqueName,
                    Description = json.Serialize(actors),
                    Critics = json.Serialize(critics),
                    Link = mov.UniqueName
                };

                LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                #endregion
            }
            catch (Exception movieEx)
            {
                Debug.WriteLine("Error while crawling movie - " + movieLink);
                Debug.WriteLine(movieEx);
            }
        }
        #endregion
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception: {0}", ex);
        throw;
    }
}
/// <summary>
/// Crawls the single movie named <paramref name="movieName"/> out of the supplied movie XML,
/// then stores the movie, crawls its posters and reviews, and refreshes its Lucene search entry.
/// </summary>
/// <param name="xmlData">
/// XML text whose <c>Movies/Month/Movie</c> nodes describe the movies. Nothing happens when it is null or empty.
/// </param>
/// <param name="movieName">
/// Name matched, ignoring case, against each movie node's <c>name</c> attribute. Nodes that do not
/// match (or have no <c>name</c> attribute) are skipped.
/// </param>
/// <remarks>
/// Errors raised while processing one movie, or while crawling its reviews, are written to
/// <see cref="Debug"/> and processing continues. Any other error (for example XML that cannot be
/// loaded) is logged and rethrown to the caller.
/// </remarks>
private void CrawlfromXML(string xmlData, string movieName)
{
    if (string.IsNullOrEmpty(xmlData))
    {
        return;
    }

    Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
    JavaScriptSerializer json = new JavaScriptSerializer();

    try
    {
        XmlDocument xdoc = new XmlDocument();

        #region Movie Crawler
        xdoc.LoadXml(xmlData);

        var movies = xdoc.SelectNodes("Movies/Month/Movie");
        if (movies == null)
        {
            return;
        }

        foreach (XmlNode movie in movies)
        {
            // The XML lists every movie of the month; only the requested one is crawled.
            // Ordinal, case-insensitive matching avoids culture-dependent lower-casing and
            // tolerates a missing attribute or a null movieName.
            XmlAttribute movieNameAttr = movie.Attributes["name"];
            if (movieNameAttr == null
                || !string.Equals(movieNameAttr.Value, movieName, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlAttribute movieLinkAttr = movie.Attributes["link"];
            if (movieLinkAttr == null || string.IsNullOrEmpty(movieLinkAttr.Value))
            {
                continue;
            }

            string movieLink = movieLinkAttr.Value;

            try
            {
                List<string> critics = new List<string>();

                #region Crawl Movie
                MovieEntity mov = movieCrawler.Crawl(movieLink);
                TableManager tblMgr = new TableManager();

                // Save the crawled content because in case of new movies, it fails
                tblMgr.UpdateMovieById(mov);

                string posterUrl = string.Empty;

                XmlAttribute santaPosterAttr = movie.Attributes["santaposterlink"];
                if (santaPosterAttr != null && !string.IsNullOrEmpty(santaPosterAttr.Value))
                {
                    XMLMovieProperties prop = new XMLMovieProperties();
                    prop.SantaPosterLink = santaPosterAttr.Value;
                    prop.MovieName = mov.UniqueName;

                    CrawlPosters(json.Serialize(prop));
                }

                // Without identifiers neither reviews nor the search entry can be tied to the movie.
                if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId))
                {
                    continue;
                }

                tblMgr.UpdateMovieById(mov);
                #endregion

                #region Crawl Movie Reviews
                try
                {
                    BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                    HindustanTimesReviews ht = new HindustanTimesReviews();
                    FilmfareReviews ff = new FilmfareReviews();
                    CnnIbn cibn = new CnnIbn();
                    BoxOfficeIndia boi = new BoxOfficeIndia();
                    Dna dna = new Dna();
                    FirstPost fp = new FirstPost();
                    IndianExpress ie = new IndianExpress();
                    KomalNahta kn = new KomalNahta();
                    MidDay md = new MidDay();
                    Ndtv ndtv = new Ndtv();
                    Rajasen rs = new Rajasen();
                    Rediff rdf = new Rediff();
                    Telegraph tg = new Telegraph();
                    TheHindu th = new TheHindu();
                    TimesOfIndia toi = new TimesOfIndia();
                    AnupamaChopra ac = new AnupamaChopra();
                    MumbaiMirror mm = new MumbaiMirror();

                    var reviews = movie.SelectNodes("Review");
                    List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                    if (reviews != null)
                    {
                        foreach (XmlNode review in reviews)
                        {
                            XmlAttribute affiliationAttr = review.Attributes["name"];
                            XmlAttribute reviewLinkAttr = review.Attributes["link"];
                            if (affiliationAttr == null || reviewLinkAttr == null)
                            {
                                // A review node without name/link cannot be crawled; skip it
                                // instead of aborting the remaining reviews.
                                continue;
                            }

                            string affiliation = affiliationAttr.Value;
                            string reviewLink = reviewLinkAttr.Value;

                            // Skip reviews already stored for this movie under the same affiliation.
                            if (reviewList != null && reviewList.Exists(r => r.Affiliation == affiliation))
                            {
                                continue;
                            }

                            // NOTE(review): an affiliation with no matching case below keeps this blank
                            // entity and is still saved further down - confirm whether unknown
                            // sources should be skipped instead.
                            ReviewEntity re = new ReviewEntity();

                            switch (affiliation.Trim())
                            {
                                case "BollywoodHungama":
                                case "Bollywood Hungama":
                                    re = bh.Crawl(reviewLink, affiliation);
                                    break;
                                case "Hindustan Times":
                                    re = ht.Crawl(reviewLink, affiliation);
                                    break;
                                case "Filmfare":
                                    re = ff.Crawl(reviewLink, affiliation);
                                    break;
                                case "CNN IBN":
                                case "CNNIBN":
                                    re = cibn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Box Office India":
                                    re = boi.Crawl(reviewLink, affiliation);
                                    break;
                                case "DNA":
                                    re = dna.Crawl(reviewLink, affiliation);
                                    break;
                                case "FirstPost":
                                    re = fp.Crawl(reviewLink, affiliation);
                                    break;
                                case "Indian Express":
                                    re = ie.Crawl(reviewLink, affiliation);
                                    break;
                                case "Komal Nahta's Blog":
                                    re = kn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mid Day":
                                case "MidDay":
                                    re = md.Crawl(reviewLink, affiliation);
                                    break;
                                case "NDTV":
                                    re = ndtv.Crawl(reviewLink, affiliation);
                                    break;
                                case "rajasen.com":
                                    re = rs.Crawl(reviewLink, affiliation);
                                    break;
                                case "Rediff":
                                    re = rdf.Crawl(reviewLink, affiliation);
                                    break;
                                case "Telegraph":
                                    re = tg.Crawl(reviewLink, affiliation);
                                    break;
                                case "The Hindu":
                                    re = th.Crawl(reviewLink, affiliation);
                                    break;
                                case "Times of India":
                                    re = toi.Crawl(reviewLink, affiliation);
                                    break;
                                case "anupamachopra.com":
                                    re = ac.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mumbai Mirror":
                                    re = mm.Crawl(reviewLink, affiliation);
                                    break;
                            }

                            if (re == null)
                            {
                                continue;
                            }

                            critics.Add(re.ReviewerName);

                            // update the IDs - Movie Id, Reviewer Id etc.
                            string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, affiliation);
                            re.ReviewerId = reviewerId;
                            re.MovieId = mov.MovieId;
                            re.OutLink = reviewLink;
                            tblMgr.UpdateReviewById(re);
                        }
                    }
                }
                catch (Exception reviewEx)
                {
                    // Review crawling stays best-effort so the search entry below is still
                    // refreshed, but the failure is no longer swallowed silently.
                    Debug.WriteLine("Error while crawling reviews for movie - " + movieLink);
                    Debug.WriteLine(reviewEx);
                }
                #endregion

                #region Lucene Search Index
                // Deserialize rejects a null input, so a movie without cast/poster data is
                // treated as having none rather than failing the whole index update.
                List<APIRole.UDT.Cast> casts = string.IsNullOrEmpty(mov.Cast)
                    ? null
                    : json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                List<String> posters = string.IsNullOrEmpty(mov.Posters)
                    ? null
                    : json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                List<String> actors = new List<string>();

                if (casts != null)
                {
                    foreach (var actor in casts)
                    {
                        // actor, director, music, producer
                        string role = (actor.role ?? string.Empty).ToLowerInvariant();
                        string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                        // Skip artists already listed for another role, and artists without a usable name.
                        if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
                        {
                            continue;
                        }

                        // Only the main artists are indexed; other roles fall through and are ignored.
                        switch (role)
                        {
                            case "actor":
                                actors.Add(actor.name);
                                break;
                            case "producer":
                                // Producers are sometimes listed as line producer etc.;
                                // only entries whose character name equals the role are kept.
                                if (characterName == role)
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                            case "music":
                            case "director":
                                // Main music/movie directors have no character name, whereas
                                // associate or assistant directors do; the latter are skipped.
                                if (string.IsNullOrEmpty(characterName))
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                        }
                    }
                }

                if (posters != null && posters.Count > 0)
                {
                    // The last poster in the list is used as the title image.
                    posterUrl = posters[posters.Count - 1];
                }

                // include reviewer & their affiliation in index file
                MovieSearchData movieSearchIndex = new MovieSearchData
                {
                    Id = mov.RowKey,
                    Title = mov.Name,
                    Type = mov.Genre,
                    TitleImageURL = posterUrl,
                    UniqueName = mov.UniqueName,
                    Description = json.Serialize(actors),
                    Critics = json.Serialize(critics),
                    Link = mov.UniqueName
                };

                LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                #endregion
            }
            catch (Exception movieEx)
            {
                Debug.WriteLine("Error while crawling movie - " + movieLink);
                Debug.WriteLine(movieEx);
            }
        }
        #endregion
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception: {0}", ex);
        throw;
    }
}