Beispiel #1
0
        /// <summary>
        /// Reads the "Tomatoes" TSV file at <paramref name="filepath"/> and converts every
        /// record into an <see cref="OmdbEntry"/>. The returned entries can be combined
        /// with IMDB data to form complete OMDB entries.
        /// </summary>
        /// <param name="filepath">Path of the tab-separated file to read.</param>
        /// <returns>
        /// One entry per record, in file order. The list is empty (never null) when the
        /// reader yields no records.
        /// </returns>
        public static List<OmdbEntry> ParseTSVforTomatoesData(string filepath)
        {
            Tools.TraceLine("Start Parsing for Tomatoes");
            List<OmdbEntry> omdbEntry_list = new List<OmdbEntry>();

            // NOTE(review): the argument order looks like the LumenWorks CsvReader overload
            // (hasHeaders, delimiter, quote, escape, comment, trimming) - confirm against
            // the library actually referenced by the project.
            using (CsvReader csvReader = new CsvReader(new StreamReader(filepath),
                                                       true, '\t', '~', '`', '~',
                                                       ValueTrimmingOptions.None))
            {
                // Each successful ReadNextRecord() positions the reader on one row; the
                // factory turns the current row into an entry.
                while (csvReader.ReadNextRecord())
                {
                    omdbEntry_list.Add(
                        Omdb.CreateOmdbEntryFromTsvRecord(tomReader: csvReader));
                }
            }

            Tools.TraceLine("Done Parsing for Tomatoes");
            return omdbEntry_list;
        }
Beispiel #2
0
        /// <summary>
        /// Parses the TSV with the IMDB movie data in batches and saves each batch to the db.
        /// Entries whose hash code matches an entry that was already in the db when this
        /// method started are skipped.
        /// </summary>
        /// <param name="imdb_filepath">the path to the omdb.txt with the IMDB movie data</param>
        /// <param name="numOfMoviesPerLoop">
        /// Number of records parsed and saved per batch; must be at least 1. According to the
        /// original author's measurements 5000 is about 180MB, 10000 was around 240MB and
        /// 15000 can be as high as 300MB.
        /// </param>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="numOfMoviesPerLoop"/> is less than 1.
        /// </exception>
        private static void OptimizedImdbTsvParse(string imdb_filepath, int numOfMoviesPerLoop = 5000)
        {
            // A non-positive batch size could never make progress through the file.
            if (numOfMoviesPerLoop < 1)
            {
                throw new System.ArgumentOutOfRangeException(
                    "numOfMoviesPerLoop", numOfMoviesPerLoop,
                    "The batch size must be at least 1.");
            }

            // Collect the hash codes of the OmdbEntrys already in the db. A set gives
            // constant-time membership checks and removes duplicates on insertion.
            HashSet<int> existingOmdbHashes = new HashSet<int>();
            using (MovieDbContext tempDb = new MovieDbContext())
            {
                foreach (OmdbEntry omdb in tempDb.Omdb)
                {
                    existingOmdbHashes.Add(omdb.GetHashCode());
                }
            }

            using (
                CsvReader imdb_csvReader =
                    new CsvReader(new StreamReader(imdb_filepath), true, '\t',
                                  '~', '`', '~', ValueTrimmingOptions.None))
            {
                // hasRecord is true while the reader is positioned on a row that has not
                // been parsed yet. Carrying it across batches ensures the row that ends one
                // batch's read-ahead becomes the first row of the next batch instead of
                // being skipped.
                bool hasRecord = imdb_csvReader.ReadNextRecord();
                while (hasRecord)
                {
                    //read up to numOfMoviesPerLoop rows, creating an OmdbEntry from each
                    List<OmdbEntry> small_omdbEntry_list = new List<OmdbEntry>();
                    while (hasRecord && small_omdbEntry_list.Count < numOfMoviesPerLoop)
                    {
                        small_omdbEntry_list.Add(
                            Omdb.CreateOmdbEntryFromTsvRecord(
                                imdbReader: imdb_csvReader));
                        hasRecord = imdb_csvReader.ReadNextRecord();
                    }

                    if (!hasRecord)
                    {
                        Tools.TraceLine(
                            "ReadNextRecord was false, breaking out of loop to save it");
                    }

                    //save the batch with a fresh, short-lived context
                    using (MovieDbContext db = new MovieDbContext())
                    {
                        // Entities are only added here, so automatic change detection is
                        // switched off for the lifetime of this context.
                        db.Configuration.AutoDetectChangesEnabled = false;

                        //add every entry whose hash is not among the pre-existing hashes
                        foreach (OmdbEntry omdbEntry in small_omdbEntry_list)
                        {
                            if (!existingOmdbHashes.Contains(omdbEntry.GetHashCode()))
                            {
                                db.Omdb.Add(omdbEntry);
                            }
                        }

                        Tools.TraceLine(
                            "saving omdbs. # of omdbs in table before save: {0}",
                            db.Omdb.Count());
                        db.SaveChanges();
                    }
                }

                Tools.TraceLine("Done saving IMDB OmdbEntrys");
            }
        }
Beispiel #3
0
        /// <summary>
        /// Parses a TSV for RT data in batches, then copies the new data onto the existing
        /// IMDB OmdbEntrys in the db that share the same ombd_ID and saves the changes.
        /// RT records with no matching entry in the db are not saved.
        /// </summary>
        /// <param name="tom_filepath">Path of the tab-separated RT file to read.</param>
        public static void OptimizedRtTsvParse(string tom_filepath)
        {
            // No hash-based duplicate check here: per the original author's note, the RT
            // data should always be updated.
            const int num_of_RT_movies_per_loop = 5000;

            using (
                CsvReader tom_csvReader =
                    new CsvReader(new StreamReader(tom_filepath), true, '\t',
                                  '~', '`', '~', ValueTrimmingOptions.None))
            {
                // hasRecord is true while the reader is positioned on a row that has not
                // been parsed yet. Carrying it across batches ensures the row that ends one
                // batch's read-ahead becomes the first row of the next batch instead of
                // being skipped.
                bool hasRecord = tom_csvReader.ReadNextRecord();
                while (hasRecord)
                {
                    //read up to num_of_RT_movies_per_loop rows, creating an OmdbEntry from each
                    List<OmdbEntry> new_tom_omdb_entries = new List<OmdbEntry>();
                    while (hasRecord &&
                           new_tom_omdb_entries.Count < num_of_RT_movies_per_loop)
                    {
                        new_tom_omdb_entries.Add(
                            Omdb.CreateOmdbEntryFromTsvRecord(tomReader: tom_csvReader));
                        hasRecord = tom_csvReader.ReadNextRecord();
                    }

                    if (!hasRecord)
                    {
                        Tools.TraceLine(
                            "ReadNextRecord was false, breaking out of loop to save it");
                    }

                    // Index the batch by ombd_ID for constant-time lookup. When an id occurs
                    // more than once the first occurrence is kept.
                    Dictionary<int, OmdbEntry> rtEntryById =
                        new Dictionary<int, OmdbEntry>();
                    foreach (OmdbEntry rtEntry in new_tom_omdb_entries)
                    {
                        if (!rtEntryById.ContainsKey(rtEntry.ombd_ID))
                        {
                            rtEntryById.Add(rtEntry.ombd_ID, rtEntry);
                        }
                    }

                    // The using block disposes the context even if the query or save throws.
                    using (MovieDbContext db = new MovieDbContext())
                    {
                        // The loaded entities are modified in place, so change detection
                        // must be on for SaveChanges to pick up the edits.
                        db.Configuration.AutoDetectChangesEnabled = true;
                        Tools.TraceLine("items in db.Omdb {0}", db.Omdb.Count());

                        //the omdb_ids of the new RT omdbentrys
                        List<int> tom_omdb_ids_to_match =
                            new List<int>(rtEntryById.Keys);

                        //existing IMDB omdbentries whose id appears in this batch
                        List<OmdbEntry> matched_existing_imdb_omdbentrys =
                            (from imdb in db.Omdb
                             where tom_omdb_ids_to_match.Contains(imdb.ombd_ID)
                             select imdb).ToList();

                        //update each existing IMDB entry with its RT counterpart
                        foreach (
                            OmdbEntry matchedExistingImdbOmdbentry in
                            matched_existing_imdb_omdbentrys)
                        {
                            // The query only returns ids taken from rtEntryById, so the
                            // lookup always succeeds.
                            UpdateImdbEntryWithRtEntry(
                                matchedExistingImdbOmdbentry,
                                rtEntryById[matchedExistingImdbOmdbentry.ombd_ID]);
                        }

                        //save the updated OmdbEntry information
                        db.SaveChanges();
                    }

                    Tools.TraceLine("Updated existing IMDB OmdbEntrys, done saving");
                }
            }
        }