/// <summary>
/// Reads the Rotten Tomatoes TSV file and builds one <see cref="OmdbEntry"/> per record.
/// The returned entries are meant to be combined with the IMDB data to form complete
/// OMDB entries.
/// </summary>
/// <param name="filepath">Path of the tab-separated file holding the Rotten Tomatoes data.</param>
/// <returns>The parsed entries, in the order the records were read.</returns>
public static List<OmdbEntry> ParseTSVforTomatoesData(string filepath)
{
    Tools.TraceLine("Start Parsing for Tomatoes");

    var parsedEntries = new List<OmdbEntry>();

    // Tab-delimited reader; the boolean flag presumably marks the first row as a
    // header row (LumenWorks-style CsvReader signature) -- TODO confirm.
    using (var tomatoesReader = new CsvReader(
        new StreamReader(filepath), true, '\t', '~', '`', '~', ValueTrimmingOptions.None))
    {
        // Keep converting records until the reader reports there are none left.
        while (tomatoesReader.ReadNextRecord())
        {
            parsedEntries.Add(
                Omdb.CreateOmdbEntryFromTsvRecord(tomReader: tomatoesReader));
        }

        Tools.TraceLine("Done Parsing for Tomatoes");
    }

    return parsedEntries;
}
/// <summary>
/// Parses the TSV with the IMDB movie data in batches and saves to the db every entry
/// whose hash code is not already present in the Omdb table.
/// </summary>
/// <remarks>
/// The set of known hashes is loaded once, before parsing starts, and is not extended
/// while parsing; rows that duplicate each other inside the file are therefore not
/// filtered out, only rows that duplicate what was already stored.
/// </remarks>
/// <param name="imdb_filepath">the path to the omdb.txt with the IMDB movie data</param>
/// <param name="numOfMoviesPerLoop">
/// Number of rows parsed and saved per batch. Per the original author's measurements:
/// 5000 is about 180MB, 10000 was around 240MB, 15000 can be as high as 300MB.
/// </param>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="numOfMoviesPerLoop"/> is less than 1.
/// </exception>
private static void OptimizedImdbTsvParse(string imdb_filepath, int numOfMoviesPerLoop = 5000)
{
    // A non-positive batch size could never make progress (previously it silently
    // consumed the whole file without saving anything).
    if (numOfMoviesPerLoop < 1)
    {
        throw new System.ArgumentOutOfRangeException(
            "numOfMoviesPerLoop", numOfMoviesPerLoop, "The batch size must be at least 1.");
    }

    // Hash codes of every OmdbEntry currently stored. GetHashCode is evaluated in
    // memory while enumerating the table; a HashSet gives O(1) membership tests and
    // removes duplicates for free.
    var existingHashes = new HashSet<int>();
    using (var hashDb = new MovieDbContext())
    {
        foreach (OmdbEntry omdb in hashDb.Omdb)
        {
            existingHashes.Add(omdb.GetHashCode());
        }
    }

    using (CsvReader imdb_csvReader =
        new CsvReader(new StreamReader(imdb_filepath), true, '\t', '~', '`', '~', ValueTrimmingOptions.None))
    {
        // hasRecord always describes the row the reader is currently positioned on, so
        // a row fetched at the end of one batch becomes the first row of the next batch
        // instead of being skipped.
        bool hasRecord = imdb_csvReader.ReadNextRecord();

        while (hasRecord)
        {
            // Parse up to numOfMoviesPerLoop rows into one batch.
            var batch = new List<OmdbEntry>();
            while (hasRecord && batch.Count < numOfMoviesPerLoop)
            {
                batch.Add(Omdb.CreateOmdbEntryFromTsvRecord(imdbReader: imdb_csvReader));
                hasRecord = imdb_csvReader.ReadNextRecord();
            }

            if (!hasRecord)
            {
                Tools.TraceLine(
                    "ReadNextRecord was false, breaking out of loop to save it");
            }

            // One short-lived context per batch; disposing it releases the entities
            // it tracks before the next batch is parsed.
            using (var db = new MovieDbContext())
            {
                // Entities are only added, so automatic change detection is not needed.
                db.Configuration.AutoDetectChangesEnabled = false;

                // Add every entry whose hash is not among the stored hashes.
                foreach (OmdbEntry omdbEntry in batch)
                {
                    if (!existingHashes.Contains(omdbEntry.GetHashCode()))
                    {
                        db.Omdb.Add(omdbEntry);
                    }
                }

                Tools.TraceLine(
                    "saving omdbs. # of omdbs in table before save: {0}",
                    db.Omdb.Count());
                db.SaveChanges();
            }
        }

        Tools.TraceLine("Done saving IMDB OmdbEntrys");
    }
}
/// <summary>
/// Parses a TSV of RT data in batches and, for every existing IMDB OmdbEntry in the db
/// whose ombd_ID appears in the batch, merges the RT data into it and saves the change.
/// </summary>
/// <remarks>
/// RT rows whose ombd_ID has no matching entry in the db are ignored. When a batch
/// contains several rows with the same ombd_ID, the first one read is used.
/// </remarks>
/// <param name="tom_filepath">Path of the tab-separated file holding the RT data.</param>
public static void OptimizedRtTsvParse(string tom_filepath)
{
    // No hash-based duplicate filtering here: the RT data should always be applied
    // to the stored entries.
    const int num_of_RT_movies_per_loop = 5000;

    using (CsvReader tom_csvReader =
        new CsvReader(new StreamReader(tom_filepath), true, '\t', '~', '`', '~', ValueTrimmingOptions.None))
    {
        // hasRecord always describes the row the reader is currently positioned on, so
        // a row fetched at the end of one batch becomes the first row of the next batch
        // instead of being skipped.
        bool hasRecord = tom_csvReader.ReadNextRecord();

        while (hasRecord)
        {
            // RT entries of this batch keyed by ombd_ID for O(1) lookup. Only the first
            // row per id is kept, matching the former First(...) lookup.
            var rtEntriesById = new Dictionary<int, OmdbEntry>();
            int rowsInBatch = 0;

            while (hasRecord && rowsInBatch < num_of_RT_movies_per_loop)
            {
                // parse the current TSV row
                var entry = Omdb.CreateOmdbEntryFromTsvRecord(tomReader: tom_csvReader);
                if (!rtEntriesById.ContainsKey(entry.ombd_ID))
                {
                    rtEntriesById.Add(entry.ombd_ID, entry);
                }

                rowsInBatch++;
                hasRecord = tom_csvReader.ReadNextRecord();
            }

            if (!hasRecord)
            {
                Tools.TraceLine(
                    "ReadNextRecord was false, breaking out of loop to save it");
            }

            // One context per batch, disposed even if the update or save throws.
            using (var db = new MovieDbContext())
            {
                // Existing entities are modified in place, so change detection must be on
                // for SaveChanges to pick the modifications up.
                db.Configuration.AutoDetectChangesEnabled = true;

                Tools.TraceLine("items in db.Omdb {0}", db.Omdb.Count());

                // ids of the RT entries in this batch
                List<int> tom_omdb_ids_to_match = rtEntriesById.Keys.ToList();

                // existing IMDB entries whose id appears in this batch
                List<OmdbEntry> matched_existing_imdb_omdbentrys =
                    db.Omdb
                      .Where(imdb => tom_omdb_ids_to_match.Contains(imdb.ombd_ID))
                      .ToList();

                foreach (OmdbEntry matchedExistingImdbOmdbentry in matched_existing_imdb_omdbentrys)
                {
                    // Every matched id came from the dictionary's keys, so the lookup
                    // cannot miss.
                    OmdbEntry matching_RT_data = rtEntriesById[matchedExistingImdbOmdbentry.ombd_ID];

                    // updates the IMDB OmdbEntry with the RT OmdbEntry's information
                    UpdateImdbEntryWithRtEntry(matchedExistingImdbOmdbentry, matching_RT_data);
                }

                // save the updated OmdbEntry information
                db.SaveChanges();
            }

            Tools.TraceLine("Updated existing IMDB OmdbEntrys, done saving");
        }
    }
}