/// <summary>
/// Builds one chapter JSON file from the per-page response tokens and then merges the
/// chapter into the master dictionary.
/// </summary>
/// <remarks>
/// Every token is serialized back to text and deserialized as a list of ResponseJSON, of
/// which only the first element is used. A page that fails to process is logged and
/// skipped; the remaining pages are still processed. If the master dictionary file exists
/// but cannot be read or deserialized, an error is logged and nothing is written.
/// </remarks>
/// <param name="inputJTokens">One token per page; the 1-based position in the list is used as the page number.</param>
/// <param name="outputFile">Path the indented, UTF-8 encoded chapter JSON is written to.</param>
/// <param name="masterFilePath">Path of the master dictionary that the chapter is merged into after writing.</param>
/// <param name="chapterNum">Chapter number stored in the output and used in log messages.</param>
public void FormatJSON(List<JToken> inputJTokens, string outputFile, string masterFilePath, double chapterNum)
{
    // Validate the master dictionary before doing any work. The deserialized value itself is
    // not needed here; the merge at the end of this method receives the path.
    if (File.Exists(masterFilePath))
    {
        try
        {
            JsonConvert.DeserializeObject<MasterDictionary>(File.ReadAllText(masterFilePath));
        }
        catch (Exception)
        {
            logger.Error(string.Format("Unable to open the master dictionary at {0}. Halting.", masterFilePath));
            // The message promises a halt, so actually stop instead of formatting a chapter
            // that cannot be merged afterwards.
            return;
        }
    }

    logger.Info(string.Format("Formatting chapter {0}.", chapterNum));

    int fileNum = 0;
    int totalJSONFiles = inputJTokens.Count;
    ChapterJSON chapterJSON = new ChapterJSON
    {
        Chapter = chapterNum,
        Pages = new List<FormattedJSONPage>(),
        MentionedWordChapterLocation = new Dictionary<string, List<int>>()
    };

    foreach (var token in inputJTokens)
    {
        ++fileNum;

        // Deserialize JSON file
        string fileText = token.ToString();
        try
        {
            ResponseJSON responseJSON = JsonConvert.DeserializeObject<List<ResponseJSON>>(fileText)[0];
            FormattedJSONPage formattedPage = new FormattedJSONPage
            {
                Width = responseJSON.Width,
                Height = responseJSON.Height,
                Words = new List<FormattedWord>(),
                // We trust that fileNum equals the page number, even though nothing enforces it.
                Page = fileNum
            };

            foreach (var line in responseJSON.Lines)
            {
                FormattedWord formattedWord = new FormattedWord
                {
                    Text = line.Text,
                    BoundingBox = new List<Coords>()
                };

                // The first eight bounding-box values are read as four consecutive (X, Y) pairs.
                for (int i = 0; i < 7; i += 2)
                {
                    formattedWord.BoundingBox.Add(new Coords
                    {
                        X = line.BoundingBox.ElementAt(i),
                        Y = line.BoundingBox.ElementAt(i + 1)
                    });
                }

                formattedPage.Words.Add(formattedWord);

                // NOTE(review): this calls a four-argument FixHyphens overload; confirm it exists,
                // since it is not the (ChapterJSON, FormattedWord, int) one.
                if (line.Text.EndsWith('-') || line.Text.StartsWith('-'))
                {
                    FixHyphens(chapterJSON, responseJSON.Lines, line, fileNum);
                }

                // In addition, scan words and add to map if needed
                foreach (var word in line.Words)
                {
                    var upperWord = RegexFormatAndToUpperWord(word.Text);
                    upperWord = AutoCorrectList(upperWord);
                    if (upperWord.Length > 0)
                    {
                        InsertWordToChapterDict(upperWord, fileNum, chapterJSON);
                    }
                }
            }

            // The page is only added once every line on it was processed without an exception.
            chapterJSON.Pages.Add(formattedPage);
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("Skipping file {0} of {1}, encountered exception: {2}", fileNum, totalJSONFiles, ex.Message));
        }
    }

    // Write to chapter JSON
    try
    {
        File.WriteAllText(outputFile, JsonConvert.SerializeObject(chapterJSON, Formatting.Indented), System.Text.Encoding.UTF8);
        AddExistingChapterJSONToMasterDictionary(chapterJSON, masterFilePath);
    }
    catch (Exception ex)
    {
        // Do not pass the exception text through string.Format: braces inside it would be
        // parsed as format items and throw a FormatException from within this handler.
        logger.Error("Error while writing files: " + ex);
    }
}
/// <summary>
/// Joins a line that starts or ends with a hyphen to a neighbouring line on the same page and
/// records the joined word in the chapter dictionary.
/// </summary>
/// <remarks>
/// Lines that neither start nor end with '-' are ignored. For a trailing hyphen, candidates
/// whose first corner has a larger Y than the hyphen line are considered; for a leading
/// hyphen, candidates with a smaller Y. The first candidate for which DoIntersect succeeds is
/// joined and the search stops. The hyphenated form is recorded unless the suffix is "ING" or
/// "INGS"; a second form is recorded unless the suffix is one of the honorifics that must
/// keep its hyphen.
/// </remarks>
/// <param name="chapterJSON">Chapter whose page is searched and whose dictionary receives the joined words.</param>
/// <param name="hyphenedFormattedWord">The line whose text starts or ends with a hyphen.</param>
/// <param name="fileNum">1-based page number; page fileNum - 1 of chapterJSON.Pages is searched.</param>
private void FixHyphens(ChapterJSON chapterJSON, FormattedWord hyphenedFormattedWord, int fileNum)
{
    string hyphenText = hyphenedFormattedWord.Text;
    logger.Debug("Hyphened word: " + hyphenText);

    if (string.IsNullOrEmpty(hyphenText))
    {
        return;
    }

    // A text that both starts and ends with '-' is treated as a trailing hyphen.
    bool endsWithHyphen = hyphenText.EndsWith('-');
    if (!endsWithHyphen && !hyphenText.StartsWith('-'))
    {
        return;
    }

    // Pages can be shorter than fileNum suggests, so check before indexing.
    int pageIndex = fileNum - 1;
    if (pageIndex < 0 || pageIndex >= chapterJSON.Pages.Count)
    {
        logger.Error(string.Format("Cannot fix hyphens for page {0}: the chapter holds {1} formatted page(s).", fileNum, chapterJSON.Pages.Count));
        return;
    }

    IReadOnlyList<string> suffixesKeepingHyphen = new List<string> { "SAMA", "SAN", "SENSEI", "KUN", "CHAN", "SENPAI" };
    IReadOnlyList<string> suffixesDroppingHyphen = new List<string> { "ING", "INGS" };

    var hyphenBox = hyphenedFormattedWord.BoundingBox;

    foreach (var candidateLine in chapterJSON.Pages[pageIndex].Words)
    {
        var candidateBox = candidateLine.BoundingBox;

        // We skip if the coords are the same, as then the candidate is the hyphen line itself.
        // NOTE(review): with '&&' a candidate sharing only ONE of the two corners is skipped as
        // well; confirm against how Coords implements '!=' whether '||' was intended.
        bool isOtherLine = candidateBox[0] != hyphenBox[0] && candidateBox[2] != hyphenBox[2];
        if (!isOtherLine)
        {
            continue;
        }

        if (endsWithHyphen)
        {
            // Trailing hyphen: the hyphen line supplies the prefix, the candidate the suffix.
            bool candidateHasLargerY = hyphenBox[0].Y < candidateBox[0].Y;
            if (!candidateHasLargerY)
            {
                continue;
            }

            logger.Debug("Word: " + candidateLine.Text);

            if (!DoIntersect(hyphenBox[0], hyphenBox[2], candidateBox[0], candidateBox[2], Math.Abs(hyphenBox[0].Y - hyphenBox[3].Y)))
            {
                continue;
            }

            string prefix = hyphenText.Split(' ').Last();
            string suffixUpper = RegexFormatAndToUpperWord(candidateLine.Text.Split(' ').First());
            string suffixKey = AutoCorrectList(TrimAllHyphens(suffixUpper));
            string suffixCorrected = AutoCorrectList(suffixUpper);

            if (!suffixesDroppingHyphen.Contains(suffixKey))
            {
                InsertJoinedWordIfNotEmpty(AutoCorrectList(prefix) + suffixCorrected, "Inserting: ", fileNum, chapterJSON);
            }

            // Skip if the word MUST contain a hyphen!
            // NOTE(review): unlike the leading-hyphen branch, no TrimAllHyphens is applied here;
            // verify that RegexFormatAndToUpperWord removes the prefix's trailing '-'.
            if (!suffixesKeepingHyphen.Contains(suffixKey))
            {
                InsertJoinedWordIfNotEmpty(AutoCorrectList(RegexFormatAndToUpperWord(prefix)) + suffixCorrected, "Inserting no-hyphen: ", fileNum, chapterJSON);
            }

            break;
        }
        else
        {
            // Leading hyphen: the candidate supplies the prefix, the hyphen line the suffix.
            bool candidateHasSmallerY = hyphenBox[0].Y > candidateBox[0].Y;
            if (!candidateHasSmallerY)
            {
                continue;
            }

            if (!DoIntersect(candidateBox[0], candidateBox[2], hyphenBox[0], hyphenBox[2], Math.Abs(hyphenBox[0].Y - hyphenBox[3].Y)))
            {
                continue;
            }

            string prefix = candidateLine.Text.Split(' ').Last();
            string suffixUpper = RegexFormatAndToUpperWord(hyphenText.Split(' ').First());
            string suffixKey = AutoCorrectList(TrimAllHyphens(suffixUpper));

            if (!suffixesDroppingHyphen.Contains(suffixKey))
            {
                InsertJoinedWordIfNotEmpty(AutoCorrectList(prefix) + AutoCorrectList(suffixUpper), "Inserting: ", fileNum, chapterJSON);
            }

            // Skip if the word MUST contain a hyphen!
            if (!suffixesKeepingHyphen.Contains(suffixKey))
            {
                // The hyphen-free suffix is exactly the key computed above.
                InsertJoinedWordIfNotEmpty(AutoCorrectList(RegexFormatAndToUpperWord(prefix)) + suffixKey, "Inserting: ", fileNum, chapterJSON);
            }

            break;
        }
    }
}

/// <summary>
/// Normalizes a joined word and, if anything is left, logs it and adds it to the chapter
/// dictionary for the given page.
/// </summary>
/// <param name="joinedWord">Prefix and suffix already concatenated.</param>
/// <param name="logPrefix">Text placed in front of the word in the debug log entry.</param>
/// <param name="fileNum">Page number the word is recorded under.</param>
/// <param name="chapterJSON">Chapter whose dictionary receives the word.</param>
private void InsertJoinedWordIfNotEmpty(string joinedWord, string logPrefix, int fileNum, ChapterJSON chapterJSON)
{
    string upperWord = RegexFormatAndToUpperWord(joinedWord);
    if (upperWord.Length > 0)
    {
        logger.Debug(logPrefix + upperWord);
        InsertWordToChapterDict(upperWord, fileNum, chapterJSON);
    }
}