/// <summary>
/// Crawls a single movie, selected by name, from the movie XML: the movie page itself,
/// its poster (when a poster link is given), every listed review that is not already stored,
/// and finally adds or updates the movie's Lucene search-index entry.
/// </summary>
/// <param name="xmlData">
/// XML text containing <c>Movies/Month/Movie</c> nodes. Nothing happens when it is null or empty.
/// </param>
/// <param name="movieName">
/// Name compared (ordinal, ignoring case) with each movie node's <c>name</c> attribute.
/// Movies whose name does not match are skipped.
/// </param>
/// <remarks>
/// Failures while crawling one movie or one review are logged through <see cref="Debug"/> and do
/// not stop the remaining work. Failures while loading or querying the XML are logged and rethrown.
/// </remarks>
private void CrawlfromXML(string xmlData, string movieName)
{
    if (string.IsNullOrEmpty(xmlData)) return;

    Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
    JavaScriptSerializer json = new JavaScriptSerializer();

    try
    {
        XmlDocument xdoc = new XmlDocument();
        xdoc.LoadXml(xmlData);

        var movies = xdoc.SelectNodes("Movies/Month/Movie");
        if (movies == null) return;

        foreach (XmlNode movie in movies)
        {
            // We only crawl the requested movie, not every movie listed in the XML.
            // A node without a name attribute can never match, so it is skipped instead of throwing.
            XmlAttribute nameAttr = movie.Attributes["name"];
            if (nameAttr == null || !string.Equals(nameAttr.Value, movieName, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlAttribute linkAttr = movie.Attributes["link"];
            if (linkAttr == null || string.IsNullOrEmpty(linkAttr.Value))
            {
                continue;
            }

            try
            {
                List<string> critics = new List<string>();

                // ---- Crawl movie ----
                MovieEntity mov = movieCrawler.Crawl(linkAttr.Value);
                TableManager tblMgr = new TableManager();

                // Save the crawled content because in case of new movies, it fails
                tblMgr.UpdateMovieById(mov);

                string posterUrl = string.Empty;

                XmlAttribute santaPosterAttr = movie.Attributes["santaposterlink"];
                if (santaPosterAttr != null && !string.IsNullOrEmpty(santaPosterAttr.Value))
                {
                    XMLMovieProperties prop = new XMLMovieProperties();
                    prop.SantaPosterLink = santaPosterAttr.Value;
                    prop.MovieName = mov.UniqueName;

                    CrawlPosters(json.Serialize(prop));
                }

                // Without both keys the movie cannot be linked to reviews or indexed.
                if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId))
                {
                    continue;
                }

                tblMgr.UpdateMovieById(mov);

                // ---- Crawl movie reviews ----
                try
                {
                    BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                    HindustanTimesReviews ht = new HindustanTimesReviews();
                    FilmfareReviews ff = new FilmfareReviews();
                    CnnIbn cibn = new CnnIbn();
                    BoxOfficeIndia boi = new BoxOfficeIndia();
                    Dna dna = new Dna();
                    FirstPost fp = new FirstPost();
                    IndianExpress ie = new IndianExpress();
                    KomalNahta kn = new KomalNahta();
                    MidDay md = new MidDay();
                    Ndtv ndtv = new Ndtv();
                    Rajasen rs = new Rajasen();
                    Rediff rdf = new Rediff();
                    Telegraph tg = new Telegraph();
                    TheHindu th = new TheHindu();
                    TimesOfIndia toi = new TimesOfIndia();
                    AnupamaChopra ac = new AnupamaChopra();
                    MumbaiMirror mm = new MumbaiMirror();

                    var reviews = movie.SelectNodes("Review");
                    List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                    foreach (XmlNode review in reviews)
                    {
                        // Each review is isolated so that one broken link or malformed node
                        // does not prevent the remaining reviews from being crawled.
                        try
                        {
                            string affiliation = review.Attributes["name"].Value;

                            ReviewEntity duplicateRE = reviewList.Find(r => r.Affiliation == affiliation);
                            if (duplicateRE != null)
                            {
                                // We found the duplicate, skip this review to crawl
                                continue;
                            }

                            // NOTE(review): an affiliation that matches no case below keeps this blank
                            // entity, which is then saved like a crawled one - confirm that is intended.
                            ReviewEntity re = new ReviewEntity();
                            string reviewLink = review.Attributes["link"].Value;

                            switch (affiliation.Trim())
                            {
                                case "BollywoodHungama":
                                case "Bollywood Hungama":
                                    re = bh.Crawl(reviewLink, affiliation);
                                    break;
                                case "Hindustan Times":
                                    re = ht.Crawl(reviewLink, affiliation);
                                    break;
                                case "Filmfare":
                                    re = ff.Crawl(reviewLink, affiliation);
                                    break;
                                case "CNN IBN":
                                case "CNNIBN":
                                    re = cibn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Box Office India":
                                    re = boi.Crawl(reviewLink, affiliation);
                                    break;
                                case "DNA":
                                    re = dna.Crawl(reviewLink, affiliation);
                                    break;
                                case "FirstPost":
                                    re = fp.Crawl(reviewLink, affiliation);
                                    break;
                                case "Indian Express":
                                    re = ie.Crawl(reviewLink, affiliation);
                                    break;
                                case "Komal Nahta's Blog":
                                    re = kn.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mid Day":
                                case "MidDay":
                                    re = md.Crawl(reviewLink, affiliation);
                                    break;
                                case "NDTV":
                                    re = ndtv.Crawl(reviewLink, affiliation);
                                    break;
                                case "rajasen.com":
                                    re = rs.Crawl(reviewLink, affiliation);
                                    break;
                                case "Rediff":
                                    re = rdf.Crawl(reviewLink, affiliation);
                                    break;
                                case "Telegraph":
                                    re = tg.Crawl(reviewLink, affiliation);
                                    break;
                                case "The Hindu":
                                    re = th.Crawl(reviewLink, affiliation);
                                    break;
                                case "Times of India":
                                    re = toi.Crawl(reviewLink, affiliation);
                                    break;
                                case "anupamachopra.com":
                                    re = ac.Crawl(reviewLink, affiliation);
                                    break;
                                case "Mumbai Mirror":
                                    re = mm.Crawl(reviewLink, affiliation);
                                    break;
                            }

                            if (re == null) continue;

                            critics.Add(re.ReviewerName);

                            // update the IDs - Movie Id, Reviewer Id etc.
                            string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, affiliation);
                            re.ReviewerId = reviewerId;
                            re.MovieId = mov.MovieId;
                            re.OutLink = reviewLink;
                            tblMgr.UpdateReviewById(re);
                        }
                        catch (Exception ex)
                        {
                            Debug.WriteLine("Error while crawling a review for movie - " + linkAttr.Value + ": " + ex);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Review crawling is best effort: the search index below is still refreshed.
                    Debug.WriteLine("Error while crawling reviews for movie - " + linkAttr.Value + ": " + ex);
                }

                // ---- Lucene search index ----
                // JavaScriptSerializer.Deserialize throws on null input; a movie without cast or
                // posters should still be indexed, so null is treated as "nothing to deserialize".
                List<APIRole.UDT.Cast> casts = mov.Cast == null
                    ? null
                    : json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                List<String> posters = mov.Posters == null
                    ? null
                    : json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                List<String> actors = new List<string>();

                if (casts != null)
                {
                    foreach (var actor in casts)
                    {
                        // actor, director, music, producer
                        string role = (actor.role ?? string.Empty).ToLowerInvariant();
                        string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                        // Check if artist is already present in the list for some other role.
                        // If yes, skip it. Also if the actor name is missing then skip the artist
                        if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
                            continue;

                        // Only the main artists are indexed, not every technician.
                        switch (role)
                        {
                            case "actor":
                                actors.Add(actor.name);
                                break;
                            case "producer":
                                // Sometimes producers are listed as line producer etc.
                                // We are not interested in those artists as of now, hence skipping them.
                                if (characterName == role)
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                            case "music":
                            case "director":
                                // The main music director and movie director have no associated character name,
                                // whereas associate/assistant directors do. Skipping the latter.
                                if (string.IsNullOrEmpty(characterName))
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                        }
                    }
                }

                if (posters != null && posters.Count > 0)
                {
                    // The last poster in the list is used as the title image.
                    posterUrl = posters[posters.Count - 1];
                }

                // include reviewer & their affiliation in index file
                MovieSearchData movieSearchIndex = new MovieSearchData
                {
                    Id = mov.RowKey,
                    Title = mov.Name,
                    Type = mov.Genre,
                    TitleImageURL = posterUrl,
                    UniqueName = mov.UniqueName,
                    Description = json.Serialize(actors),
                    Critics = json.Serialize(critics),
                    Link = mov.UniqueName
                };

                LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
            }
            catch (Exception ex)
            {
                Debug.WriteLine("Error while crawling movie - " + linkAttr.Value + ": " + ex);
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception: {0}", ex);
        throw;
    }
}
/// <summary>
/// Uploads an algorithm run log through <c>Util.UploadLogFile</c> and stores the returned
/// location on the review as its <c>AlgoLogUrl</c>.
/// </summary>
/// <param name="physicalPath">Path of the log file handed to <c>Util.UploadLogFile</c>.</param>
/// <param name="reviewId">Id of the review whose <c>AlgoLogUrl</c> is updated.</param>
/// <remarks>
/// When no review exists for <paramref name="reviewId"/> the upload has already happened,
/// but nothing is saved; the situation is logged instead of throwing.
/// </remarks>
internal static void UploadAlgorithmRunLogs(string physicalPath, string reviewId)
{
    TableManager tm = new TableManager();
    string blobPath = Util.UploadLogFile(physicalPath);

    ReviewEntity re = tm.GetReviewById(reviewId);
    if (re == null)
    {
        // The lookup can come back empty (the sibling methods guard against it as well).
        Debug.WriteLine("Unable to find review with id " + reviewId + "; algorithm log url was not saved.");
        return;
    }

    re.AlgoLogUrl = blobPath;
    tm.UpdateReviewById(re);
}
/// <summary>
/// Stores the system rating on a review and folds it into the movie's aggregated score
/// (<c>MyScore</c> JSON and <c>Rating</c> percentage).
/// </summary>
/// <param name="movieId">Id of the movie whose score is updated.</param>
/// <param name="reviewId">Id of the review that receives the rating.</param>
/// <param name="rating">
/// Raw rating. Negative values are stored as -1; every other value (including 0) is stored as +1.
/// </param>
/// <param name="bag">Not used by this method.</param>
/// <returns>
/// A JSON object with <c>Status</c> ("Ok" or "Error") and a <c>UserMessage</c>.
/// </returns>
internal static string SetReviewAndUpdateMovieRating(string movieId, string reviewId, int rating, string bag)
{
    // Score used when the movie has no usable MyScore value yet.
    const string defaultScoreJson = "{\"teekharating\":\"0\",\"feekharating\":\"0\",\"criticrating\":\"\"}";

    var tableMgr = new TableManager();

    MovieEntity movie = tableMgr.GetMovieById(movieId);
    if (movie == null)
    {
        return jsonSerializer.Value.Serialize(new { Status = "Error", UserMessage = "Unable to find movie with passed movie id. Please check movie id." });
    }

    ReviewEntity review = tableMgr.GetReviewById(reviewId);
    if (review == null)
    {
        return jsonSerializer.Value.Serialize(new { Status = "Error", UserMessage = "Unable to find review with passed review id. Please check review id." });
    }

    // -1 => Negative
    //  0 => No rating
    // +1 => Positive
    // NOTE(review): this normalisation never yields 0 - a rating of 0 is counted as positive.
    // Confirm whether "no rating" should be supported here.
    rating = (rating < 0) ? -1 : 1;

    review.SystemRating = rating;
    tableMgr.UpdateReviewById(review);

    string myscore = movie.MyScore;
    if (string.IsNullOrEmpty(myscore) || myscore == "0")
    {
        myscore = defaultScoreJson;
    }

    RatingConvertion oldRating = null;
    try
    {
        oldRating = jsonSerializer.Value.Deserialize(myscore, typeof(RatingConvertion)) as RatingConvertion;
    }
    catch (Exception ex)
    {
        // A malformed stored score is not fatal: start again from the default score.
        Debug.WriteLine("Unable to parse stored score for movie " + movieId + ": " + ex);
    }

    if (oldRating == null)
    {
        // Reached after a parse failure, and also when the stored JSON deserializes to null
        // without throwing; dereferencing it would otherwise fail.
        oldRating = jsonSerializer.Value.Deserialize(defaultScoreJson, typeof(RatingConvertion)) as RatingConvertion;
    }

    var teekha = oldRating.teekharating + (rating > 0 ? 1 : 0);
    var feekha = oldRating.feekharating + (rating < 0 ? 1 : 0);

    RatingConvertion newRating = new RatingConvertion();
    newRating.teekharating = teekha;
    newRating.feekharating = feekha;
    // Share of positive ratings as a whole-number percentage (truncated).
    newRating.criticrating = ((int)(teekha / (double)(teekha + feekha) * 100)).ToString();

    string strNewRating = jsonSerializer.Value.Serialize(newRating);
    movie.Rating = newRating.criticrating;
    movie.MyScore = strNewRating;
    tableMgr.UpdateMovieById(movie);

    return jsonSerializer.Value.Serialize(new { Status = "Ok", UserMessage = "Successfully update movie rating" });
}
/// <summary>
/// Reads an algorithm output file, collects the words of all terms whose <c>DebugString</c>
/// is <c>bigram_a_n</c>, and stores them (comma separated, underscores replaced by spaces)
/// as the review's <c>Tags</c>.
/// </summary>
/// <param name="reviewId">Id of the review to tag. Nothing is saved when no such review exists.</param>
/// <param name="filePath">Path of the algorithm output file to read.</param>
/// <remarks>
/// Only lines containing <c>POS-tagger:</c> are considered. Each such line is split on tabs,
/// the first tab-separated field is ignored, and the remaining fields are read as
/// <c>Key: Value</c> pairs, e.g.
/// <c>Word: drama  POS-tagger: NN  POS-SWN: n  Tag: POS  Sentiment: 0.13774104683195595  DebugString: null</c>.
/// Fields without a colon and lines lacking <c>DebugString</c> or <c>Word</c> are skipped
/// instead of aborting the whole run.
/// </remarks>
internal static void SetTagsForReview(string reviewId, string filePath)
{
    var lines = File.ReadAllLines(filePath);

    var words = new List<string>();
    foreach (var line in lines)
    {
        if (!line.Contains("POS-tagger:"))
        {
            continue;
        }

        var fields = new Dictionary<string, string>();
        foreach (var field in line.Split('\t').Skip(1))
        {
            // Key is the text before the first colon, value the text between the first and second.
            var parts = field.Trim().Split(':');
            if (parts.Length < 2)
            {
                continue;
            }

            // A repeated key overwrites the earlier value rather than throwing.
            fields[parts[0].Trim()] = parts[1].Trim();
        }

        string debugString;
        string word;
        if (fields.TryGetValue("DebugString", out debugString)
            && debugString == "bigram_a_n"
            && fields.TryGetValue("Word", out word))
        {
            words.Add(word.Replace("_", " "));
        }
    }

    var tags = string.Join(", ", words);

    var tableMgr = new TableManager();
    ReviewEntity review = tableMgr.GetReviewById(reviewId);
    if (review != null)
    {
        review.Tags = tags;
        tableMgr.UpdateReviewById(review);
    }
}