예제 #1
0
        /// <summary>
        /// Builds the formatted JSON for one chapter from its per-page tokens, writes it to
        /// <paramref name="outputFile"/> and then hands the chapter to
        /// <c>AddExistingChapterJSONToMasterDictionary</c>.
        /// </summary>
        /// <param name="inputJTokens">
        /// One token per page, in page order. Each token's text is deserialized as a JSON array
        /// and only the first element of that array is used.
        /// </param>
        /// <param name="outputFile">Path the chapter JSON is written to (UTF-8, indented).</param>
        /// <param name="masterFilePath">
        /// Path of the master dictionary. If the file exists but cannot be read or deserialized,
        /// the error is logged and the method returns without producing any output.
        /// </param>
        /// <param name="chapterNum">Chapter number stored in the resulting chapter JSON.</param>
        /// <remarks>
        /// A page that fails to deserialize or process is logged and skipped; the remaining pages
        /// are still processed. Errors while writing the output are logged, not rethrown.
        /// </remarks>
        public void FormatJSON(List <JToken> inputJTokens, string outputFile, string masterFilePath, double chapterNum)
        {
            // Validate an existing master dictionary before doing any work. The deserialized
            // value itself is not needed here, only the fact that it can be loaded.
            if (File.Exists(masterFilePath))
            {
                try
                {
                    JsonConvert.DeserializeObject <MasterDictionary>(File.ReadAllText(masterFilePath));
                }
                catch (Exception ex)
                {
                    // The message promises to halt, so actually stop instead of falling through.
                    logger.Error(string.Format("Unable to open the master dictionary at {0}.  Halting. ({1})", masterFilePath, ex.Message));
                    return;
                }
            }

            logger.Info(string.Format("Formatting chapter {0}.", chapterNum));
            int fileNum        = 0;
            int totalJSONFiles = inputJTokens.Count;

            ChapterJSON chapterJSON = new ChapterJSON
            {
                Chapter = chapterNum,
                Pages   = new List <FormattedJSONPage>(),
                MentionedWordChapterLocation = new Dictionary <string, List <int> >()
            };

            foreach (var token in inputJTokens)
            {
                ++fileNum;
                // Deserialize JSON file
                string fileText = token.ToString();
                try
                {
                    ResponseJSON responseJSON = JsonConvert.DeserializeObject <List <ResponseJSON> >(fileText)[0];

                    FormattedJSONPage formattedPage = new FormattedJSONPage
                    {
                        Width  = responseJSON.Width,
                        Height = responseJSON.Height,
                        Words  = new List <FormattedWord>(),
                        // The position of the token in the input list is trusted to be the page number.
                        Page   = fileNum
                    };

                    foreach (var line in responseJSON.Lines)
                    {
                        FormattedWord formattedWord = new FormattedWord
                        {
                            Text        = line.Text,
                            BoundingBox = new List <Coords>()
                        };
                        // The source bounding box is read as four consecutive (X, Y) pairs
                        // stored flat at indices 0..7.
                        for (int i = 0; i < 7; i += 2)
                        {
                            formattedWord.BoundingBox.Add(new Coords {
                                X = line.BoundingBox.ElementAt(i), Y = line.BoundingBox.ElementAt(i + 1)
                            });
                        }
                        formattedPage.Words.Add(formattedWord);

                        if (line.Text.EndsWith('-') || line.Text.StartsWith('-'))
                        {
                            FixHyphens(chapterJSON, responseJSON.Lines, line, fileNum);
                        }

                        // In addition, scan words and add to map if needed
                        foreach (var word in line.Words)
                        {
                            var upperWord = RegexFormatAndToUpperWord(word.Text);
                            upperWord = AutoCorrectList(upperWord);

                            if (upperWord.Length > 0)
                            {
                                InsertWordToChapterDict(upperWord, fileNum, chapterJSON);
                            }
                        }
                    }
                    chapterJSON.Pages.Add(formattedPage);
                }
                catch (Exception ex)
                {
                    logger.Error(string.Format("Skipping file {0} of {1}, encountered exception: {2}", fileNum, totalJSONFiles, ex.Message));
                }
            }

            // Write to chapter JSON
            try
            {
                File.WriteAllText(outputFile, JsonConvert.SerializeObject(chapterJSON, Formatting.Indented), System.Text.Encoding.UTF8);
                AddExistingChapterJSONToMasterDictionary(chapterJSON, masterFilePath);
            }
            catch (Exception ex)
            {
                // Do not run the exception text through string.Format: it is not a format
                // template, and any '{' or '}' in it would raise FormatException right here.
                logger.Error("Error while writing files: " + ex);
            }
        }
예제 #2
0
        /// <summary>
        /// Joins a line that starts or ends with a hyphen with a neighbouring line of the same
        /// page and records the joined word(s) through <c>InsertWordToChapterDict</c>.
        /// </summary>
        /// <param name="chapterJSON">
        /// Chapter being built; the entries of <c>Pages[fileNum - 1].Words</c> are the candidate lines.
        /// </param>
        /// <param name="hyphenedFormattedWord">
        /// The line whose text starts or ends with '-'. If it does neither, nothing is recorded.
        /// </param>
        /// <param name="fileNum">
        /// One-based index into <c>chapterJSON.Pages</c>; also forwarded to <c>InsertWordToChapterDict</c>.
        /// </param>
        /// <remarks>
        /// Only the first candidate for which <c>DoIntersect</c> returns true is used. Up to two
        /// spellings are recorded for it: one built from the raw hyphenated token and one built
        /// from the normalized tokens.
        /// </remarks>
        private void FixHyphens(ChapterJSON chapterJSON, FormattedWord hyphenedFormattedWord, int fileNum)
        {
            Console.WriteLine("Hyphened word: " + hyphenedFormattedWord.Text);

            bool hyphenAtEnd = hyphenedFormattedWord.Text.EndsWith('-');
            if (!hyphenAtEnd && !hyphenedFormattedWord.Text.StartsWith('-'))
            {
                return;
            }

            // Continuations for which the form built from normalized tokens is not recorded.
            IReadOnlyList <string> keepHyphenSuffixes = new List <string> {
                "SAMA", "SAN", "SENSEI", "KUN", "CHAN", "SENPAI"
            };
            // Continuations for which the form built from the raw hyphenated token is not recorded.
            IReadOnlyList <string> dropHyphenSuffixes = new List <string> {
                "ING", "INGS"
            };

            // Normalizes a continuation token into the key looked up in the two lists above.
            string KeyOf(string token)
            {
                return AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(token)));
            }

            // Formats the joined text and, if anything is left, records it for this page.
            void InsertIfNotEmpty(string joined, string consolePrefix)
            {
                string formatted = RegexFormatAndToUpperWord(joined);
                if (formatted.Length > 0)
                {
                    Console.WriteLine(consolePrefix + formatted);
                    InsertWordToChapterDict(formatted, fileNum, chapterJSON);
                }
            }

            foreach (var other in chapterJSON.Pages[fileNum - 1].Words)
            {
                // A candidate is only considered when BOTH compared corners differ from the
                // hyphenated line's corners; this is meant to skip the hyphenated line itself.
                // NOTE(review): how '!=' behaves depends on how Coords defines equality, and the
                // '&&' also skips candidates sharing just one corner - confirm this is intended.
                bool cornersDiffer = other.BoundingBox[0] != hyphenedFormattedWord.BoundingBox[0]
                                     && other.BoundingBox[2] != hyphenedFormattedWord.BoundingBox[2];
                if (!cornersDiffer)
                {
                    continue;
                }

                if (hyphenAtEnd)
                {
                    // Trailing hyphen: the continuation must have a strictly greater Y.
                    if (!(hyphenedFormattedWord.BoundingBox[0].Y < other.BoundingBox[0].Y))
                    {
                        continue;
                    }

                    logger.Debug("Word: " + other.Text);

                    bool intersects = DoIntersect(
                        hyphenedFormattedWord.BoundingBox[0],
                        hyphenedFormattedWord.BoundingBox[2],
                        other.BoundingBox[0],
                        other.BoundingBox[2],
                        Math.Abs(hyphenedFormattedWord.BoundingBox[0].Y - hyphenedFormattedWord.BoundingBox[3].Y));
                    if (!intersects)
                    {
                        continue;
                    }

                    // The continuation is the first space-separated token of the candidate line.
                    string tail = other.Text.Split(' ').First();

                    if (!dropHyphenSuffixes.Contains(KeyOf(tail)))
                    {
                        string left  = AutoCorrectList(hyphenedFormattedWord.Text.Split(' ').Last());
                        string right = AutoCorrectList(RegexFormatAndToUpperWord(tail));
                        InsertIfNotEmpty(left + right, "Inserting: ");
                    }

                    // Skip if the word MUST contain a hyphen!
                    if (!keepHyphenSuffixes.Contains(KeyOf(tail)))
                    {
                        // NOTE(review): unlike the leading-hyphen branch below, neither half goes
                        // through TrimAllHyphens here - confirm RegexFormatAndToUpperWord already
                        // removes the trailing hyphen.
                        string left  = AutoCorrectList(RegexFormatAndToUpperWord(hyphenedFormattedWord.Text.Split(' ').Last()));
                        string right = AutoCorrectList(RegexFormatAndToUpperWord(tail));
                        InsertIfNotEmpty(left + right, "Inserting no-hyphen: ");
                    }

                    break;
                }
                else
                {
                    // Leading hyphen: the preceding part must have a strictly smaller Y.
                    if (!(hyphenedFormattedWord.BoundingBox[0].Y > other.BoundingBox[0].Y))
                    {
                        continue;
                    }

                    bool intersects = DoIntersect(
                        other.BoundingBox[0],
                        other.BoundingBox[2],
                        hyphenedFormattedWord.BoundingBox[0],
                        hyphenedFormattedWord.BoundingBox[2],
                        Math.Abs(hyphenedFormattedWord.BoundingBox[0].Y - hyphenedFormattedWord.BoundingBox[3].Y));
                    if (!intersects)
                    {
                        continue;
                    }

                    // Here the continuation is the first token of the hyphenated line itself.
                    string tail = hyphenedFormattedWord.Text.Split(' ').First();

                    if (!dropHyphenSuffixes.Contains(KeyOf(tail)))
                    {
                        string left  = AutoCorrectList(other.Text.Split(' ').Last());
                        string right = AutoCorrectList(RegexFormatAndToUpperWord(tail));
                        InsertIfNotEmpty(left + right, "Inserting: ");
                    }

                    // Skip if the word MUST contain a hyphen!
                    if (!keepHyphenSuffixes.Contains(KeyOf(tail)))
                    {
                        string left  = AutoCorrectList(RegexFormatAndToUpperWord(other.Text.Split(' ').Last()));
                        string right = KeyOf(tail);
                        InsertIfNotEmpty(left + right, "Inserting: ");
                    }

                    break;
                }
            }
        }