Esempio n. 1
0
        /// <summary>
        /// Creates (or replaces) a single search index entry for the given record.
        /// Any existing document whose "Id" term equals the record's Id is deleted first,
        /// then a fresh document carrying all fields is added through the same writer.
        /// </summary>
        /// <param name="movieSearchData">Record whose properties are mapped to index fields.</param>
        /// <param name="writer">Index writer used for both the delete and the add; it is not committed or disposed here.</param>
        private static void _addToLuceneIndex(MovieSearchData movieSearchData, IndexWriter writer)
        {
            // Compute the key once; it is used for both the delete query and the stored field.
            var id = movieSearchData.Id.ToString();

            // remove older index entry
            var searchQuery = new TermQuery(new Term("Id", id));

            writer.DeleteDocuments(searchQuery);

            // add new index entry
            var doc = new Document();

            // add lucene fields mapped to db fields.
            // Lucene.Net's Field constructor rejects null values, so missing optional data
            // (e.g. no genre or no poster) is indexed as an empty string instead of
            // failing the whole record.
            doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("Title", movieSearchData.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("UniqueName", movieSearchData.UniqueName ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("TitleImageURL", movieSearchData.TitleImageURL ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Type", movieSearchData.Type ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Link", movieSearchData.Link ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Description", movieSearchData.Description ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Critics", movieSearchData.Critics ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

            // add entry to index
            writer.AddDocument(doc);
        }
        /// <summary>
        /// Crawls one movie (selected by name) from the supplied XML, stores the crawled movie and
        /// its reviews through <c>TableManager</c>, and refreshes the movie's search index entry.
        /// </summary>
        /// <param name="xmlData">XML text queried with <c>Movies/Month/Movie</c>; null or empty input is ignored.</param>
        /// <param name="movieName">
        /// Name of the movie to crawl. It is compared case-insensitively with each node's <c>name</c>
        /// attribute; nodes without that attribute never match.
        /// </param>
        /// <remarks>
        /// Failures while crawling a movie or one of its reviews are logged and do not stop processing.
        /// Failures while loading or querying the XML are logged and rethrown.
        /// </remarks>
        private void CrawlfromXML(string xmlData, string movieName)
        {
            if (string.IsNullOrEmpty(xmlData)) return;

            Crawler.MovieCrawler movieCrawler = new Crawler.MovieCrawler();
            JavaScriptSerializer json = new JavaScriptSerializer();

            try
            {
                XmlDocument xdoc = new XmlDocument();

                #region Movie Crawler
                xdoc.LoadXml(xmlData);
                var movies = xdoc.SelectNodes("Movies/Month/Movie");
                if (movies == null) return;

                foreach (XmlNode movie in movies)
                {
                    // We just need to crawl a single movie, not all the movies present in the XML for the month.
                    // A node without a name attribute cannot match and is skipped instead of throwing.
                    var nameAttribute = movie.Attributes["name"];
                    if (nameAttribute == null ||
                        !string.Equals(nameAttribute.Value, movieName, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    var linkAttribute = movie.Attributes["link"];
                    if (linkAttribute == null || string.IsNullOrEmpty(linkAttribute.Value))
                    {
                        continue;
                    }

                    string movieLink = linkAttribute.Value;

                    try
                    {
                        List<string> critics = new List<string>();

                        #region Crawl Movie
                        MovieEntity mov = movieCrawler.Crawl(movieLink);
                        TableManager tblMgr = new TableManager();
                        // Save the crawled content because in case of new movies, it fails
                        tblMgr.UpdateMovieById(mov);

                        string posterUrl = string.Empty;

                        var santaPosterAttribute = movie.Attributes["santaposterlink"];
                        if (santaPosterAttribute != null && !string.IsNullOrEmpty(santaPosterAttribute.Value))
                        {
                            XMLMovieProperties prop = new XMLMovieProperties();
                            prop.SantaPosterLink = santaPosterAttribute.Value;
                            prop.MovieName = mov.UniqueName;

                            CrawlPosters(json.Serialize(prop));
                        }

                        // Without both keys the movie can be neither linked to reviews nor indexed.
                        if (string.IsNullOrEmpty(mov.RowKey) || string.IsNullOrEmpty(mov.MovieId)) continue;

                        tblMgr.UpdateMovieById(mov);
                        #endregion

                        #region Crawl Movie Reviews
                        try
                        {
                            BollywoodHungamaReviews bh = new BollywoodHungamaReviews();
                            HindustanTimesReviews ht = new HindustanTimesReviews();
                            FilmfareReviews ff = new FilmfareReviews();
                            CnnIbn cibn = new CnnIbn();
                            BoxOfficeIndia boi = new BoxOfficeIndia();
                            Dna dna = new Dna();
                            FirstPost fp = new FirstPost();
                            IndianExpress ie = new IndianExpress();
                            KomalNahta kn = new KomalNahta();
                            MidDay md = new MidDay();
                            Ndtv ndtv = new Ndtv();
                            Rajasen rs = new Rajasen();
                            Rediff rdf = new Rediff();
                            Telegraph tg = new Telegraph();
                            TheHindu th = new TheHindu();
                            TimesOfIndia toi = new TimesOfIndia();
                            AnupamaChopra ac = new AnupamaChopra();
                            MumbaiMirror mm = new MumbaiMirror();

                            var reviews = movie.SelectNodes("Review");

                            List<ReviewEntity> reviewList = tblMgr.GetReviewByMovieId(mov.MovieId);

                            foreach (XmlNode review in reviews)
                            {
                                var affiliationAttribute = review.Attributes["name"];
                                var reviewLinkAttribute = review.Attributes["link"];
                                if (affiliationAttribute == null || reviewLinkAttribute == null)
                                {
                                    Debug.WriteLine("Skipping review without name/link attribute for movie - " + movieLink);
                                    continue;
                                }

                                string affiliation = affiliationAttribute.Value;
                                string reviewLink = reviewLinkAttribute.Value;

                                // Each review is isolated so that one failing source does not
                                // prevent the remaining reviews of this movie from being crawled.
                                try
                                {
                                    ReviewEntity duplicateRE = reviewList.Find(r => r.Affiliation == affiliation);
                                    if (duplicateRE != null)
                                    {
                                        // We found the duplicate, skip this review to crawl
                                        continue;
                                    }

                                    // NOTE(review): an affiliation that matches no case below keeps this empty
                                    // entity and is still stored (with only the IDs and OutLink set) - confirm
                                    // that this is intended.
                                    ReviewEntity re = new ReviewEntity();

                                    switch (affiliation.Trim())
                                    {
                                        case "BollywoodHungama":
                                        case "Bollywood Hungama":
                                            re = bh.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Hindustan Times":
                                            re = ht.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Filmfare":
                                            re = ff.Crawl(reviewLink, affiliation);
                                            break;
                                        case "CNN IBN":
                                        case "CNNIBN":
                                            re = cibn.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Box Office India":
                                            re = boi.Crawl(reviewLink, affiliation);
                                            break;
                                        case "DNA":
                                            re = dna.Crawl(reviewLink, affiliation);
                                            break;
                                        case "FirstPost":
                                            re = fp.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Indian Express":
                                            re = ie.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Komal Nahta's Blog":
                                            re = kn.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Mid Day":
                                        case "MidDay":
                                            re = md.Crawl(reviewLink, affiliation);
                                            break;
                                        case "NDTV":
                                            re = ndtv.Crawl(reviewLink, affiliation);
                                            break;
                                        case "rajasen.com":
                                            re = rs.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Rediff":
                                            re = rdf.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Telegraph":
                                            re = tg.Crawl(reviewLink, affiliation);
                                            break;
                                        case "The Hindu":
                                            re = th.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Times of India":
                                            re = toi.Crawl(reviewLink, affiliation);
                                            break;
                                        case "anupamachopra.com":
                                            re = ac.Crawl(reviewLink, affiliation);
                                            break;
                                        case "Mumbai Mirror":
                                            re = mm.Crawl(reviewLink, affiliation);
                                            break;
                                    }

                                    if (re == null)
                                        continue;

                                    critics.Add(re.ReviewerName);

                                    // update the IDs - Movie Id, Reviewer Id etc.
                                    string reviewerId = ReviewCrawler.SetReviewer(re.ReviewerName, affiliation);
                                    re.ReviewerId = reviewerId;
                                    re.MovieId = mov.MovieId;
                                    re.OutLink = reviewLink;
                                    tblMgr.UpdateReviewById(re);
                                }
                                catch (Exception reviewEx)
                                {
                                    Debug.WriteLine("Error while crawling review - " + reviewLink + " : " + reviewEx);
                                }
                            }
                        }
                        catch (Exception reviewsEx)
                        {
                            // Review crawling is best-effort: the movie is still indexed below,
                            // but the failure is logged instead of being silently dropped.
                            Debug.WriteLine("Error while crawling reviews for movie - " + movieLink + " : " + reviewsEx);
                        }
                        #endregion

                        #region Lucene Search Index
                        // JavaScriptSerializer.Deserialize throws on null input, so missing JSON is treated as "no data".
                        List<APIRole.UDT.Cast> casts = string.IsNullOrEmpty(mov.Cast)
                            ? null
                            : json.Deserialize(mov.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;
                        List<String> posters = string.IsNullOrEmpty(mov.Posters)
                            ? null
                            : json.Deserialize(mov.Posters, typeof(List<String>)) as List<String>;
                        List<String> actors = new List<string>();

                        if (casts != null)
                        {
                            foreach (var actor in casts)
                            {
                                // Skip entries without a usable name and artists already listed for another role.
                                if (actor == null || string.IsNullOrEmpty(actor.name) || actor.name == "null" || actors.Contains(actor.name))
                                    continue;

                                // actor, director, music, producer
                                string role = (actor.role ?? string.Empty).ToLowerInvariant();
                                string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                                // If we want to showcase main artists and not all, keep the following switch... case.
                                switch (role)
                                {
                                    case "actor":
                                        actors.Add(actor.name);
                                        break;
                                    case "producer":
                                        // Sometimes producers are listed as line producer etc.
                                        // We are not interested in those artists as of now, hence skipping them.
                                        if (characterName == role)
                                        {
                                            actors.Add(actor.name);
                                        }
                                        break;
                                    case "music":
                                    case "director":
                                        // The main music director and movie director have no associated character name,
                                        // whereas associate/assistant directors do. Skipping such cases.
                                        if (string.IsNullOrEmpty(characterName))
                                        {
                                            actors.Add(actor.name);
                                        }
                                        break;
                                }
                            }
                        }

                        // The last poster in the list is used as the title image.
                        if (posters != null && posters.Count > 0)
                        {
                            posterUrl = posters[posters.Count - 1];
                        }

                        // include reviewer & their affiliation in index file
                        MovieSearchData movieSearchIndex = new MovieSearchData();
                        movieSearchIndex.Id = mov.RowKey;
                        movieSearchIndex.Title = mov.Name;
                        movieSearchIndex.Type = mov.Genre;
                        movieSearchIndex.TitleImageURL = posterUrl;
                        movieSearchIndex.UniqueName = mov.UniqueName;
                        movieSearchIndex.Description = json.Serialize(actors);
                        movieSearchIndex.Critics = json.Serialize(critics);
                        movieSearchIndex.Link = mov.UniqueName;
                        LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                        #endregion
                    }
                    catch (Exception movieEx)
                    {
                        Debug.WriteLine("Error while crawling movie - " + movieLink + " : " + movieEx);
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                Debug.WriteLine("Exception: {0}", ex);
                throw;
            }
        }
Esempio n. 3
0
 /// <summary>
 /// Adds or updates a single record in the search index by wrapping it in a
 /// one-element array and forwarding it to the array-accepting overload
 /// (defined outside this snippet).
 /// </summary>
 /// <param name="movieSearchData">The record to forward.</param>
 public static void AddUpdateLuceneIndex(MovieSearchData movieSearchData)
 {
     MovieSearchData[] singleRecord = { movieSearchData };
     AddUpdateLuceneIndex(singleRecord);
 }
Esempio n. 4
0
        /// <summary>
        /// Rebuilds the search index entry of a movie on a background task: the existing
        /// records are cleared and a fresh entry built from the movie and its reviews is added.
        /// </summary>
        /// <param name="movie">Movie whose index entry is refreshed; a null movie is ignored.</param>
        /// <remarks>
        /// The work is started with <c>Task.Run</c> and not awaited, so this method returns before the
        /// index is updated. Failures inside the task are logged rather than left unobserved.
        /// </remarks>
        private static void UpdateLuceneIndex(MovieEntity movie)
        {
            if (movie == null)
            {
                // Nothing to index; previously this failed silently inside the background task.
                return;
            }

            var tableMgr = new TableManager();

            // Update Lucene
            Task.Run(() =>
            {
                try
                {
                    //delete Entry in lucene search index
                    // Fix following method call - What shall be other param?
                    LuceneSearch.ClearLuceneIndexRecord(movie.MovieId, "Id");
                    LuceneSearch.ClearLuceneIndexRecord(movie.UniqueName, "UniqueName");

                    string posterUrl = "default-movie.jpg";
                    string critics = string.Empty;

                    if (!string.IsNullOrEmpty(movie.Posters))
                    {
                        List<string> pList = jsonSerializer.Value.Deserialize(movie.Posters, typeof(List<string>)) as List<string>;
                        if (pList != null && pList.Count > 0)
                        {
                            // The last poster in the list is used as the title image.
                            posterUrl = pList.Last();
                        }
                    }

                    var reviewDic = tableMgr.GetReviewsByMovieId(movie.MovieId);
                    if (reviewDic != null && reviewDic.Values != null && reviewDic.Values.Count > 0)
                    {
                        critics = jsonSerializer.Value.Serialize(reviewDic.Values.Select(re => re.ReviewerName));
                    }

                    // add updated entry in lucene search index
                    MovieSearchData movieSearchIndex = new MovieSearchData();
                    movieSearchIndex.Id = movie.RowKey;
                    movieSearchIndex.Title = movie.Name;
                    movieSearchIndex.Type = movie.Genre;
                    movieSearchIndex.TitleImageURL = posterUrl;
                    movieSearchIndex.UniqueName = movie.UniqueName;
                    movieSearchIndex.Description = movie.Cast;
                    movieSearchIndex.Critics = critics;
                    movieSearchIndex.Link = movie.UniqueName;

                    LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
                }
                catch (Exception ex)
                {
                    // Nobody awaits this task, so an escaping exception would go unnoticed.
                    System.Diagnostics.Debug.WriteLine("Error while updating Lucene index for movie - " + movie.UniqueName + " : " + ex);
                }
            });
        }
        /// <summary>
        /// Rebuilds the search index for every movie returned by <c>TableManager.GetAllMovies</c>.
        /// For each movie the title image (last poster), the main artists and the reviewer names
        /// are collected and passed to <c>LuceneSearch.AddUpdateLuceneIndex</c>.
        /// </summary>
        private void BuildMovieIndex()
        {
            TableManager tblMgr = new TableManager();
            JavaScriptSerializer json = new JavaScriptSerializer();

            IDictionary<string, MovieEntity> movies = tblMgr.GetAllMovies();
            if (movies == null) return;

            foreach (MovieEntity movie in movies.Values)
            {
                // Reset per movie: otherwise a movie without posters would be indexed
                // with the poster of the previously processed movie.
                string posterUrl = string.Empty;

                // JavaScriptSerializer.Deserialize throws on null input, so missing JSON is treated as "no data".
                List<String> posters = string.IsNullOrEmpty(movie.Posters)
                    ? null
                    : json.Deserialize(movie.Posters, typeof(List<String>)) as List<String>;
                List<APIRole.UDT.Cast> casts = string.IsNullOrEmpty(movie.Cast)
                    ? null
                    : json.Deserialize(movie.Cast, typeof(List<APIRole.UDT.Cast>)) as List<APIRole.UDT.Cast>;

                List<string> actors = new List<string>();
                List<string> critics = new List<string>();

                MovieSearchData movieSearchIndex = new MovieSearchData();
                IDictionary<string, ReviewEntity> reviews = tblMgr.GetReviewsByMovieId(movie.MovieId);

                // The last poster in the list is used as the title image.
                if (posters != null && posters.Count > 0)
                {
                    posterUrl = posters[posters.Count - 1];
                }

                if (reviews != null)
                {
                    foreach (ReviewEntity review in reviews.Values)
                    {
                        if (!string.IsNullOrEmpty(review.ReviewerName))
                            critics.Add(review.ReviewerName);
                    }
                }

                if (casts != null)
                {
                    foreach (var actor in casts)
                    {
                        // Skip entries without a usable name and artists already listed for another role.
                        if (actor == null || string.IsNullOrEmpty(actor.name) || actor.name == "null" || actors.Contains(actor.name))
                            continue;

                        // actor, director, music, producer
                        string role = (actor.role ?? string.Empty).ToLowerInvariant();
                        string characterName = string.IsNullOrEmpty(actor.charactername) ? string.Empty : actor.charactername;

                        // If we want to showcase main artists and not all, keep the following switch... case.
                        switch (role)
                        {
                            case "actor":
                                actors.Add(actor.name);
                                break;
                            case "producer":
                                // Sometimes producers are listed as line producer etc.
                                // We are not interested in those artists as of now, hence skipping them.
                                if (characterName == role)
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                            case "music":
                            case "director":
                                // The main music director and movie director have no associated character name,
                                // whereas associate/assistant directors do. Skipping such cases.
                                if (string.IsNullOrEmpty(characterName))
                                {
                                    actors.Add(actor.name);
                                }
                                break;
                        }
                    }
                }

                movieSearchIndex.Id = movie.RowKey;
                movieSearchIndex.Title = movie.Name;
                movieSearchIndex.Type = movie.Genre;

                // Selected poster url
                movieSearchIndex.TitleImageURL = posterUrl;

                movieSearchIndex.UniqueName = movie.UniqueName;
                movieSearchIndex.Description = json.Serialize(actors);
                movieSearchIndex.Critics = json.Serialize(critics);
                movieSearchIndex.Link = movie.UniqueName;
                LuceneSearch.AddUpdateLuceneIndex(movieSearchIndex);
            }
        }
Esempio n. 6
0
 /// <summary>
 /// Single-record entry point for index updates: places the record in a one-element
 /// array and hands it to the array-accepting overload (defined outside this snippet).
 /// </summary>
 /// <param name="movieSearchData">The record to forward.</param>
 public static void AddUpdateLuceneIndex(MovieSearchData movieSearchData)
 {
     var batch = new MovieSearchData[1];
     batch[0] = movieSearchData;

     AddUpdateLuceneIndex(batch);
 }
Esempio n. 7
0
        /// <summary>
        /// Writes one record to the search index, replacing any previous entry with the same Id:
        /// documents matching the "Id" term are deleted, then a new document with all fields is added.
        /// </summary>
        /// <param name="movieSearchData">Record whose properties are mapped to index fields.</param>
        /// <param name="writer">Index writer used for both the delete and the add; it is not committed or disposed here.</param>
        private static void _addToLuceneIndex(MovieSearchData movieSearchData, IndexWriter writer)
        {
            // The key is needed for the delete query and for the stored field; compute it once.
            var id = movieSearchData.Id.ToString();

            // remove older index entry
            var searchQuery = new TermQuery(new Term("Id", id));
            writer.DeleteDocuments(searchQuery);

            // add new index entry
            var doc = new Document();

            // add lucene fields mapped to db fields.
            // Lucene.Net's Field constructor rejects null values, so absent optional data is
            // stored as an empty string rather than aborting the indexing of this record.
            doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("Title", movieSearchData.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("UniqueName", movieSearchData.UniqueName ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("TitleImageURL", movieSearchData.TitleImageURL ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Type", movieSearchData.Type ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Link", movieSearchData.Link ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Description", movieSearchData.Description ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Critics", movieSearchData.Critics ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

            // add entry to index
            writer.AddDocument(doc);
        }