Пример #1
0
        private void CrawlfromXML(string xmlData, string movieName)
        {
            if (string.IsNullOrEmpty(xmlData)) return;

            Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
            JavaScriptSerializer json = new JavaScriptSerializer();

            try
            {
                XmlDocument xdoc = new XmlDocument();

                #region Movie Crawler
                xdoc.LoadXml(xmlData);
                var movies = xdoc.SelectNodes("Movies/Month/Movie");
                if (movies == null) return;

                foreach (XmlNode movie in movies)
                {
                    // Check movie name, we just need to crawl single movie and not all the movies present in XML file for current month
                    if (movie.Attributes["name"].Value.ToLower() != movieName.ToLower())
                    {
                        continue;
                    }

                    if (movie.Attributes["link"] != null && !string.IsNullOrEmpty(movie.Attributes["link"].Value))
                    {
                        try
                        {
                            List<string> critics = new List<string>();
                            #region Crawl Movie
                            MovieEntity mov = movieCrawler.Crawl(movie.Attributes["link"].Value);
                            TableManager tblMgr = new TableManager();
                            // Save the crawled content because in case of new movies, it fails
                            tblMgr.UpdateMovieById(mov);

                            string posterUrl = string.Empty;

                            if (movie.Attributes["santaposterlink"] != null && !string.IsNullOrEmpty(movie.Attributes["santaposterlink"].Value))
                            {
                                XMLMovieProperties prop = new XMLMovieProperties();
                                prop.SantaPosterLink = movie.Attributes["santaposterlink"].Value;
                                prop.MovieName = mov.UniqueName;

                                CrawlPosters(json.Serialize(prop));
                            }

                            // Crawl Songs from Saavn

                            if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId)) continue;

                            tblMgr.UpdateMovieById(mov);
                            #endregion

                            #region Crawl Movie Reviews
                            #region Crawler
                            try
                            {
                                BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                                HindustanTimesReviews ht = new HindustanTimesReviews();
                                FilmfareReviews ff = new FilmfareReviews();
                                CnnIbn cibn = new CnnIbn();
                                BoxOfficeIndia boi = new BoxOfficeIndia();
                                Dna dna = new Dna();
                                FirstPost fp = new FirstPost();
                                IndianExpress ie = new IndianExpress();
                                KomalNahta kn = new KomalNahta();
                                MidDay md = new MidDay();
                                Ndtv ndtv = new Ndtv();
                                Rajasen rs = new Rajasen();
                                Rediff rdf = new Rediff();
                                Telegraph tg = new Telegraph();
                                TheHindu th = new TheHindu();
                                TimesOfIndia toi = new TimesOfIndia();
                                AnupamaChopra ac = new AnupamaChopra();
                                MumbaiMirror mm = new MumbaiMirror();

                                var reviews = movie.SelectNodes("Review");

                                List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                                foreach (XmlNode review in reviews)
                                {
                                    ReviewEntity duplicateRE = reviewList.Find(r => r.Affiliation == review.Attributes["name"].Value);
                                    if (duplicateRE != null)
                                    {
                                        // We found the duplicate, skip this review to crawl
                                        continue;
                                    }

                                    ReviewEntity re = new ReviewEntity();
                                    string reviewLink = review.Attributes["link"].Value;

                                    switch (review.Attributes["name"].Value.Trim())
                                    {
                                        case "BollywoodHungama":
                                        case "Bollywood Hungama":
                                            re = bh.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Hindustan Times":
                                            re = ht.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Filmfare":
                                            re = ff.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "CNN IBN":
                                        case "CNNIBN":
                                            re = cibn.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Box Office India":
                                            re = boi.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "DNA":
                                            re = dna.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "FirstPost":
                                            re = fp.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Indian Express":
                                            re = ie.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Komal Nahta's Blog":
                                            re = kn.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Mid Day":
                                        case "MidDay":
                                            re = md.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "NDTV":
                                            re = ndtv.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "rajasen.com":
                                            re = rs.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Rediff":
                                            re = rdf.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Telegraph":
                                            re = tg.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "The Hindu":
                                            re = th.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Times of India":
                                            re = toi.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "anupamachopra.com":
                                            re = ac.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                        case "Mumbai Mirror":
                                            re = mm.Crawl(reviewLink, review.Attributes["name"].Value);
                                            break;
                                    }

                                    if (re == null)
                                        continue;

                                    critics.Add(re.ReviewerName);

                                    // update the IDs - Movie Id, Reviewer Id etc.
                                    string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, review.Attributes["name"].Value);
                                    //re.RowKey = re.ReviewId = new Guid().ToString();
                                    re.ReviewerId = reviewerId;
                                    re.MovieId = mov.MovieId;
                                    re.OutLink = reviewLink;
                                    tblMgr.UpdateReviewById(re);
                                }
                            }
                            catch (Exception)
                            {
                            }
                            #endregion
                            #endregion

                            #region Lucene Search Index
                            List<APIRole.UDT.Cast> casts = json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                            List<String> posters = json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                            List<String> actors = new List<string>();

                            if (casts != null)
                            {
                                foreach (var actor in casts)
                                {
                                    // actor, director, music, producer
                                    string role = actor.role.ToLower();
                                    string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                                    // Check if artist is already present in the list for some other role.
                                    // If yes, skip it. Also if the actor name is missing then skip the artist
                                    if (actors.Contains(actor.name) || string.IsNullOrEmpty(actor.name) || actor.name == "null")
                                        continue;

                                    // If we want to showcase main artists and not all, keep the following switch... case.
                                    switch (role)
                                    {
                                        case "actor":
                                            actors.Add(actor.name);
                                            break;
                                        case "producer":
                                            // some times producer are listed as line producer etc.
                                            // We are not interested in those artists as of now?! Hence skipping it
                                            if (characterName == role)
                                            {
                                                actors.Add(actor.name);
                                            }
                                            break;
                                        case "music":
                                        case "director":
                                            // Main music director and movie director does not have associated character name.
                                            // Where as other side directors have associated character name as associate director, assitant director.
                                            // Skipping such cases.
                                            if (string.IsNullOrEmpty(characterName))
                                            {
                                                actors.Add(actor.name);
                                            }
                                            break;
                                    }

                                    // If we want to showcase all the technicians
                                    //actors.Add(actor.name);
                                }
                            }

                            if (posters != null && posters.Count > 0)
                            {
                                posterUrl = posters[posters.Count - 1];
                            }

                            // include reviewer & their affiliation in index file
                            MovieSearchData movieSearchIndex = new MovieSearchData();
                            movieSearchIndex.Id = mov.RowKey;
                            movieSearchIndex.Title = mov.Name;
                            movieSearchIndex.Type = mov.Genre;
                            movieSearchIndex.TitleImageURL = posterUrl;
                            movieSearchIndex.UniqueName = mov.UniqueName;
                            movieSearchIndex.Description = json.Serialize(actors);
                            movieSearchIndex.Critics = json.Serialize(critics);
                            movieSearchIndex.Link = mov.UniqueName;
                            LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                            #endregion
                        }
                        catch (Exception)
                        {
                            Debug.WriteLine("Error while crawling movie - " + movie.Attributes["link"].Value);
                        }
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                Debug.WriteLine("Exception: {0}", ex);
                throw;
            }
        }
Пример #2
0
 internal static void UploadAlgorithmRunLogs(string physicalPath, string reviewId)
 {
     TableManager tm = new TableManager();
     string blobPath = Util.UploadLogFile(physicalPath);
     ReviewEntity re = tm.GetReviewById(reviewId);
     re.AlgoLogUrl = blobPath;
     tm.UpdateReviewById(re);
 }
Пример #3
0
        internal static string SetReviewAndUpdateMovieRating(string movieId, string reviewId, int rating, string bag)
        {
            var tableMgr = new TableManager();
            MovieEntity movie = tableMgr.GetMovieById(movieId);

            if (movie != null)
            {
                ReviewEntity review = tableMgr.GetReviewById(reviewId);

                if (review != null)
                {
                    // -1 => Negative
                    //  0 => No rating
                    // +1 => Positive
                    rating = (rating < 0) ? -1 : 1;

                    review.SystemRating = rating;
                    tableMgr.UpdateReviewById(review);

                    string myscore = movie.MyScore;
                    if (string.IsNullOrEmpty(myscore) || myscore == "0")
                    {
                        myscore = "{\"teekharating\":\"0\",\"feekharating\":\"0\",\"criticrating\":\"\"}";
                    }

                    RatingConvertion newRating = new RatingConvertion();
                    RatingConvertion oldRating;
                    try
                    {
                        oldRating = jsonSerializer.Value.Deserialize(myscore, typeof(RatingConvertion)) as RatingConvertion;
                    }
                    catch
                    {
                        myscore = "{\"teekharating\":\"0\",\"feekharating\":\"0\",\"criticrating\":\"\"}";
                        oldRating = jsonSerializer.Value.Deserialize(myscore, typeof(RatingConvertion)) as RatingConvertion;
                    }

                    var teekha = oldRating.teekharating + (rating > 0 ? 1 : 0);
                    var feekha = oldRating.feekharating + (rating < 0 ? 1 : 0);
                    newRating.teekharating = teekha;
                    newRating.feekharating = feekha;
                    newRating.criticrating = ((int)(teekha / (double)(teekha + feekha) * 100)).ToString();

                    string strNewRating = jsonSerializer.Value.Serialize(newRating);
                    movie.Rating = newRating.criticrating;
                    movie.MyScore = strNewRating;
                    tableMgr.UpdateMovieById(movie);

                    return jsonSerializer.Value.Serialize(new { Status = "Ok", UserMessage = "Successfully update movie rating" });
                }
                else
                {
                    return jsonSerializer.Value.Serialize(new { Status = "Error", UserMessage = "Unable to find review with passed review id. Please check review id." });
                }
            }
            else
            {
                return jsonSerializer.Value.Serialize(new { Status = "Error", UserMessage = "Unable to find movie with passed movie id. Please check movie id." });
            }
        }
Пример #4
0
        internal static void SetTagsForReview(string reviewId, string filePath)
        {
            var lines = File.ReadAllLines(filePath);

            // Input:Sentiment: thumbsdown
            //var sentiment =
            //    lines
            //        .First(line => line.StartsWith("Sentiment: "))
            //        .Replace("Sentiment: ", "")
            //        .Trim();

            // Input:	Word: drama 	POS-tagger:  NN 	POS-SWN:  n 	Tag: POS 	Sentiment:  0.13774104683195595 	DebugString: null
            var terms =
                lines
                    .Where(line => line.Contains("POS-tagger:"))
                    .Select(line => line.Split('\t')
                        .Skip(1)
                        .Select(l => l.Trim()
                            .Split(':')
                            .Select(ll => ll.Trim())
                            .ToArray())
                        .ToDictionary(l => l[0], l => l[1]))
                        .Where(l => l["DebugString"] == "bigram_a_n")
                    .ToList();

            //terms.Sort((a, b) => double.Parse(a["Sentiment"]).CompareTo(double.Parse(b["Sentiment"])));

            //var pos = terms
            //    .Take(sentiment == "thumbsdown" ? 6 : 4);
            //var neg = terms
            //    .Skip(Math.Max(0, terms.Count - (sentiment == "thumbsup" ? 6 : 4)))
            //    .Take(sentiment == "thumbsup" ? 6 : 4);

            var tags =
                string.Join(", ",
                    terms
                //pos
                //.Concat(neg)
                        .Select(term => term["Word"].Replace("_", " ")));

            var tableMgr = new TableManager();
            ReviewEntity review = tableMgr.GetReviewById(reviewId);
            if (review != null)
            {
                review.Tags = tags;
                tableMgr.UpdateReviewById(review);
            }
        }