/// <summary>
/// Creates an independent copy of this epub: the zip archive is duplicated by saving it
/// to memory and reading it back, and every html entry is copied into fresh collections.
/// </summary>
/// <returns>A new epub with the same author, title, archive content and entries</returns>
public Epub Clone()
{
    // round-trip the archive through an in-memory buffer so the clone gets its own zip file
    // NOTE(review): the buffer is handed to the zip reader and is not disposed here;
    // presumably the reader keeps using it - confirm before adding a using block
    MemoryStream buffer = new MemoryStream();
    file.Save(buffer);
    buffer.Position = 0;
    Ionic.Zip.ZipFile copiedFile = Ionic.Zip.ZipFile.Read(buffer);

    // carry over the general info
    Epub copy = new Epub(copiedFile);
    copy.Author = Author;
    copy.Title = Title;
    copy.Entries = new Dictionary<string, HtmlEntry>();
    copy.EntryOrder = new List<HtmlEntry>();

    // duplicate each entry, preserving the original order
    foreach (var source in EntryOrder)
    {
        var duplicate = new HtmlEntry();
        duplicate.Href = source.Href;
        duplicate.Html = source.Html;
        duplicate.MimeType = source.MimeType;

        copy.Entries.Add(source.Href, duplicate);
        copy.EntryOrder.Add(duplicate);
    }

    return copy;
}
/// <summary>
/// Analyses the epub by collecting every word in it and turning the
/// collected occurrences into word entries.
/// </summary>
/// <param name="epub">The epub file</param>
/// <returns>A collection of word entries from the entire book</returns>
public Dictionary<string, WordEntry> AnalyseEpub(Epub epub)
{
    // first group all words by their lower case text, then build the entries from those groups
    return CreateWordEntriesFromOccurrences(GetWordsByText(epub));
}
/// <summary>
/// Initializes the word distribution analysis for the given epub and binds
/// a checkable copy of the word entries to the grid.
/// </summary>
/// <param name="epub">The epub the word entries belong to</param>
/// <param name="dataSource">The binding list whose original word entries are copied</param>
public WordDistributionAnalysis(Epub epub, SortableBindingList<WordEntry> dataSource)
{
    InitializeComponent();

    this.epub = epub;

    // take a copy of the underlying list rather than sharing the caller's instance
    wordEntries = dataSource.OriginalList.ToList();

    // wrap every entry in a checkable row and show the rows in the grid
    var checkableRows = wordEntries.Select(entry => new CheckableWordEntry(entry)).ToList();
    grid.DataSource = new SortableBindingList<CheckableWordEntry>(checkableRows);
}
/// <summary>
/// Opens the epub at the given path, analyses its words in the background and
/// shows the resulting word entries in the grid once the analysis has finished.
/// </summary>
/// <param name="path">The full path to the epub file</param>
public void OpenEpub(string path)
{
    // start from a clean slate: empty the grid and listbox and cancel whatever is still loading
    grid.DataSource = null;
    lstOccurrences.Items.Clear();
    loader.CancelAll();

    // read the epub file structure
    Epub epub = Epub.FromFile(path);

    // show the file name in the caption of the form
    Text = "Epub spell checker - " + System.IO.Path.GetFileName(path);

    // run the analysis through the loader; the second callback receives the result
    loader.LoadAsync<Dictionary<string, WordEntry>>(
        (progress) =>
        {
            // a progress of -1 switches the indicator to marquee
            progress.Text = "Loading epub...";
            progress.Progress = -1;

            // get all the word entries in the book
            return manager.AnalyseEpub(epub);
        },
        loadedEntries =>
        {
            // release the previously loaded epub, if any, before replacing it
            if (currentEpub != null)
            {
                currentEpub.Dispose();
            }
            currentEpub = epub;

            // bind the word entry list to the datagridview
            grid.DataSource = new SortableBindingList<WordEntry>(loadedEntries.Values);

            // bring the grid in line with the current filter
            ApplyFilter(false);

            // refresh the statistics of the word entry list
            UpdateStatistics();

            // continue with loading suggestions for the word entries
            FillSuggestions(loadedEntries);

            CheckEditMenuItemAvailibility();
        });
}
/// <summary>
/// Writes the fixed text of the word entries back into the html of the given epub.
/// </summary>
/// <param name="epub">The epub file to change</param>
/// <param name="wordEntries">The word entry collection</param>
public void Apply(Epub epub, IEnumerable<WordEntry> wordEntries)
{
    // Replacing a word can shift the character offsets of everything after it, so the
    // occurrences of each href are handled from the highest offset down to the lowest.
    // Pair every occurrence with its word entry, bucket the pairs per href and sort each bucket.
    var occurrencesPerHref = wordEntries
        .SelectMany(entry => entry.Occurrences.Select(word => new KeyValuePair<WordEntry, Word>(entry, word)))
        .GroupBy(p => p.Value.Href)
        .ToDictionary(
            bucket => bucket.Key,
            bucket => bucket.OrderByDescending(p => p.Value.CharOffset).ToArray());

    foreach (var hrefBucket in occurrencesPerHref)
    {
        // look up the epub entry the occurrences belong to
        var target = (Epub.HtmlEntry)epub.Entries[hrefBucket.Key];

        // replace the words in the html of the epub entry
        target.Html = GetReplacedHtml(target.Html, hrefBucket.Value);
    }
}
/// <summary>
/// Collects the words of every html entry in the epub and groups them
/// by their lower case text.
/// </summary>
/// <param name="epub">The epub file</param>
/// <returns>A dictionary from lower case word text to all occurrences of that word</returns>
private Dictionary<string, List<Word>> GetWordsByText(Epub epub)
{
    var grouped = new Dictionary<string, List<Word>>();

    foreach (var htmlEntry in epub.Entries.Values.OfType<Epub.HtmlEntry>())
    {
        // walk over all the words of the current entry
        foreach (var word in GetWords(htmlEntry.Href, htmlEntry.Html))
        {
            string key = word.Text.ToLower();

            // create the bucket for this text the first time it is seen
            List<Word> bucket;
            if (!grouped.TryGetValue(key, out bucket))
            {
                bucket = new List<Word>();
                grouped[key] = bucket;
            }

            bucket.Add(word);
        }
    }

    return grouped;
}
/// <summary>
/// Fully reads an epub file into memory and keeps the text entries and some general info
/// like Title and Author separate.
/// </summary>
/// <param name="path">The path of the epub file</param>
/// <returns>An epub object read from the given file</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the archive has no rootfile element, when a required attribute is missing,
/// or when a file referenced by the container or the manifest is not present in the archive.
/// </exception>
public static Epub FromFile(string path)
{
    // read the entire file, and interpret it as a zip file
    var epubBytes = System.IO.File.ReadAllBytes(path);
    var file = Ionic.Zip.ZipFile.Read(epubBytes);

    try
    {
        Epub epub = new Epub(file);

        // read the metadata container xml info
        XmlDocument doc = new XmlDocument();
        using (MemoryStream ms = ExtractZipEntryToMemory(file, @"META-INF\container.xml"))
        {
            doc.Load(ms);
        }

        // determine the href of the content manifest, which is stored in the full-path attribute of the rootfile tag
        var node = doc.ChildNodes.GetAllNodes().Where(n => n.Name == "rootfile").FirstOrDefault();
        if (node == null)
        {
            throw new InvalidDataException("No content metadata");
        }

        string contentPath = GetRequiredAttributeValue(node, "full-path");

        // keep the relative path to the manifest file, because all entries in the manifest will be relative
        string basePath = System.IO.Path.GetDirectoryName(contentPath);

        using (MemoryStream ms = ExtractZipEntryToMemory(file, contentPath))
        {
            // namespaces are stripped so the plain XPath expressions below match
            doc = new XmlDocument();
            doc.LoadXml(XDocument.Load(ms).Root.StripNamespaces().ToString());
        }

        // read the title if present
        var titleNode = doc.SelectSingleNode("package/metadata/title");
        if (titleNode != null)
        {
            epub.Title = titleNode.InnerText;
        }

        // read the author if present
        var authorNode = doc.SelectSingleNode("package/metadata/creator");
        if (authorNode != null)
        {
            epub.Author = authorNode.InnerText;
        }

        // read all the entries in the manifest
        var items = doc.SelectNodes("package/manifest/item");
        Dictionary<string, HtmlEntry> entries = new Dictionary<string, HtmlEntry>(items.Count);
        var entryOrder = new List<HtmlEntry>(items.Count);

        foreach (var item in items.Cast<XmlNode>())
        {
            string href = System.IO.Path.Combine(basePath, GetRequiredAttributeValue(item, "href"));
            string mimeType = GetRequiredAttributeValue(item, "media-type");

            // only entries whose mime type mentions html or xml are loaded as text entries
            if (mimeType == "application/xhtml+xml" || mimeType.Contains("html") || mimeType.Contains("xml"))
            {
                // the href is looked up in unescaped form, but stored on the entry as written in the manifest
                using (MemoryStream ms = ExtractZipEntryToMemory(file, Uri.UnescapeDataString(href)))
                using (StreamReader reader = new StreamReader(ms))
                {
                    string html = reader.ReadToEnd();

                    // store the entry
                    var te = new HtmlEntry()
                    {
                        Href = href,
                        MimeType = mimeType,
                        Html = html
                    };
                    entries.Add(href, te);
                    entryOrder.Add(te);
                }
            }
        }

        epub.Entries = entries;
        epub.EntryOrder = entryOrder;

        return epub;
    }
    catch
    {
        // on failure the epub is never handed to the caller, so release the archive here
        file.Dispose();
        throw;
    }
}

/// <summary>
/// Extracts a single archive entry into a memory stream positioned at its start.
/// </summary>
/// <param name="file">The zip archive to read from</param>
/// <param name="entryName">The name of the entry inside the archive</param>
/// <returns>A memory stream holding the extracted content; the caller disposes it</returns>
/// <exception cref="InvalidDataException">Thrown when the archive has no entry with that name</exception>
private static MemoryStream ExtractZipEntryToMemory(Ionic.Zip.ZipFile file, string entryName)
{
    // the indexer yields null for an unknown name, which would otherwise surface as a NullReferenceException
    var zipEntry = file[entryName];
    if (zipEntry == null)
    {
        throw new InvalidDataException("The epub archive does not contain the entry '" + entryName + "'");
    }

    MemoryStream ms = new MemoryStream();
    zipEntry.Extract(ms);
    ms.Position = 0;
    return ms;
}

/// <summary>
/// Reads the value of an attribute that has to be present on the given node.
/// </summary>
/// <param name="node">The xml node to read from</param>
/// <param name="attributeName">The name of the attribute</param>
/// <returns>The attribute value</returns>
/// <exception cref="InvalidDataException">Thrown when the node does not have the attribute</exception>
private static string GetRequiredAttributeValue(XmlNode node, string attributeName)
{
    var attribute = node.Attributes == null ? null : node.Attributes[attributeName];
    if (attribute == null)
    {
        throw new InvalidDataException("The '" + node.Name + "' element has no '" + attributeName + "' attribute");
    }

    return attribute.Value;
}