/// <summary>
/// Looks up the navigation data for <paramref name="bookData"/> and, when none is found,
/// adds a new <see cref="BookNavigationData"/> carrying the same BookId and looks it up again.
/// </summary>
/// <param name="bookdb">Context handed to every <c>CommonQueries</c> call made here.</param>
/// <param name="bookData">Book whose <c>BookId</c> keys the lookup and the newly added entry.</param>
/// <returns>
/// The navigation data that was found. When the lookup still yields null after the add,
/// the problem is reported through <c>App.Error</c> and null is returned.
/// </returns>
public static BookNavigationData BookNavigationDataEnsure(BookDataContext bookdb, BookData bookData)
{
    var bookId = bookData.BookId;

    var navigation = CommonQueries.BookNavigationDataFind(bookdb, bookId);
    if (navigation != null)
    {
        return navigation;
    }

    // Nothing found: add a fresh entry, look it up again, then save the changes.
    var fresh = new BookNavigationData()
    {
        BookId = bookId,
    };
    CommonQueries.BookNavigationDataAdd(bookdb, fresh, CommonQueries.ExistHandling.IfNotExists);
    navigation = CommonQueries.BookNavigationDataFind(bookdb, bookId);
    CommonQueries.BookSaveChanges(bookdb);

    if (navigation == null)
    {
        App.Error($"ERROR: trying to ensure navigation data, but don't have one.");
    }
    return navigation;
}
/// <summary>
/// Reads the archive in <paramref name="file"/>. Every top-level entry whose key ends in ".tar"
/// is opened as a nested archive, and the text of each entry inside it is passed to <c>Read</c>.
/// Changes are saved periodically while reading and once more at the end.
/// </summary>
/// <param name="ui">Receives <c>OnAddNewBook</c>, <c>OnTotalBooks</c> and <c>OnReadComplete</c> notifications.</param>
/// <param name="bookdb">Context passed to <c>Read</c> and to the <c>CommonQueries</c> calls.</param>
/// <param name="file">Archive file to open for reading.</param>
/// <param name="token">
/// Checked once per entry; when cancellation is requested the loops stop early. No exception is
/// thrown for cancellation, and the completion callback and final save still run.
/// </param>
/// <param name="updateType">Forwarded unchanged to <c>Read</c>.</param>
/// <returns>The sum of the counts returned by <c>Read</c> for the entries processed.</returns>
/// <remarks>
/// Exceptions raised while opening or walking the archive are logged and swallowed, so the
/// method still reports completion with whatever was read up to that point.
/// </remarks>
public static async Task<int> ReadZipTarRdfFileAsync(IndexReader ui, BookDataContext bookdb, Windows.Storage.StorageFile file, CancellationToken token, UpdateType updateType = UpdateType.Full)
{
    SaveAfterNFiles = SaveSkipCount;
    UiAfterNNodes = NodeReadCount;

    // FAIL: Gutenberg includes bad files
    HashSet<string> KnownBadFiles = new HashSet<string>()
    {
        "cache/epub/0/pg0.rdf",
        "cache/epub/999999/pg999999.rdf",
    };

    // Stopwatch rather than DateTime.Now: elapsed time must not be skewed by wall-clock adjustments.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    int nnewfiles = 0;
    int nnodes = 0;
    List<BookData> newBooks = new List<BookData>();

    try
    {
        using (var stream = await file.OpenAsync(Windows.Storage.FileAccessMode.Read))
        using (var reader = ReaderFactory.Open(stream.AsStream()))
        {
            while (reader.MoveToNextEntry())
            {
                if (token.IsCancellationRequested)
                {
                    break;
                }
                System.Diagnostics.Debug.WriteLine($"ZIPREAD: {reader.Entry.Key} size {reader.Entry.Size}");

                // Is the rdf-files.tar file that Gutenberg uses.
                // The zip file has one giant TAR file (rdf-files.tar) embedded in it.
                // Entry keys are not linguistic text, so compare ordinally.
                if (!reader.Entry.Key.EndsWith(".tar", StringComparison.Ordinal))
                {
                    continue;
                }

                using (var tarStream = reader.OpenEntryStream())
                using (var tarReader = ReaderFactory.Open(tarStream))
                {
                    while (tarReader.MoveToNextEntry())
                    {
                        // Buffer the entry and decode it; both streams are released right away
                        // instead of being left for the garbage collector on every entry.
                        string text;
                        using (var ms = new MemoryStream((int)tarReader.Entry.Size))
                        {
                            tarReader.WriteEntryTo(ms);
                            ms.Position = 0;
                            using (var sr = new StreamReader(ms))
                            {
                                text = sr.ReadToEnd();
                            }
                        }
                        nnodes++;

                        if (token.IsCancellationRequested)
                        {
                            break;
                        }
                        if (KnownBadFiles.Contains(tarReader.Entry.Key))
                        {
                            // Skip known bad files like entry 999999 -- has weird values for lots of stuff!
                            continue;
                        }

                        // Got a book; let the UI know.
                        newBooks.Clear();

                        // Reads and saves to database. And does a fancy merge if needed.
                        int newCount = 0;
                        try
                        {
                            newCount = Read(bookdb, tarReader.Entry.Key, text, newBooks, updateType);
                        }
                        catch (Exception rdfex)
                        {
                            // A single bad entry is logged and counted as zero new books;
                            // the remaining entries are still processed.
                            Log($"Error: file {file.Name} name {tarReader.Entry.Key} exception {rdfex.Message}");
                            newCount = 0;
                        }
                        nnewfiles += newCount;

                        // save very frequently. Otherwise, ka-boom!
                        SaveSkipCount = 100;

                        if (nnewfiles >= SaveAfterNFiles)
                        {
                            // FAIL: must save periodically. Can't accumulate a large number
                            // of books (e.g., 60K books in the catalog) and then save all at
                            // once; it will take up too much memory and will crash.
                            Log($"At index {CommonQueries.BookCount(bookdb)} file {file.Name} nfiles {nnewfiles}");
                            CommonQueries.BookSaveChanges(bookdb);

                            // Try resetting the singleton to reduce the number of crashes.
                            BookDataContext.ResetSingleton("InitialBookData.Db");
                            await Task.Delay(100); // Try a pause to reduce crashes.
                            SaveAfterNFiles += SaveSkipCount;
                        }

                        if (newCount > 0)
                        {
                            foreach (var bookData in newBooks)
                            {
                                await ui.OnAddNewBook(bookData);
                            }
                        }

                        if (nnodes >= UiAfterNNodes)
                        {
                            await ui.OnTotalBooks(nnodes);
                            UiAfterNNodes += NodeReadCount;
                        }
                    }
                }
            }
        }
    }
    catch (Exception readEx)
    {
        // something bad happened.
        Log($"Error: reading Gutenberg ZIP file exception {readEx.Message}");
    }

    await ui.OnReadComplete(nnodes, nnewfiles);

    var delta = stopwatch.Elapsed.TotalSeconds;
    System.Diagnostics.Debug.WriteLine($"READ: {nnewfiles} in {delta} seconds = {nnewfiles / delta} fps or {delta / nnewfiles * 1000} ms per file");

    CommonQueries.BookSaveChanges(bookdb); // Woot, woot! I've got good book data!
    return nnewfiles;
}