Example #1
0
        /// <summary>
        /// Rebuilds the word-to-page index of an already formatted chapter and writes the chapter back to disk.
        /// </summary>
        /// <param name="jsonFilePath">Path of the chapter JSON file; its contents are overwritten.</param>
        /// <param name="chapterJSON">Deserialized chapter whose index is discarded and rebuilt in place.</param>
        private void RegenerateJSON(string jsonFilePath, ChapterJSON chapterJSON)
        {
            // Throw away the old index so only entries produced by the current rules survive.
            chapterJSON.MentionedWordChapterLocation = new Dictionary <string, List <int> >();

            logger.Info(string.Format("Regenerating: {0}...", jsonFilePath));
            foreach (var currentPage in chapterJSON.Pages)
            {
                foreach (var textLine in currentPage.Words)
                {
                    // A leading or trailing hyphen is handed to FixHyphens before the line is tokenized.
                    bool touchesHyphen = textLine.Text.EndsWith('-') || textLine.Text.StartsWith('-');
                    if (touchesHyphen)
                    {
                        FixHyphens(chapterJSON, textLine, currentPage.Page);
                    }

                    foreach (var token in textLine.Text.Split(' '))
                    {
                        var normalized = RegexFormatAndToUpperWord(token);
                        if (normalized.Length == 0)
                        {
                            // Nothing left after normalization; do not index an empty key.
                            continue;
                        }

                        InsertWordToChapterDict(normalized, currentPage.Page, chapterJSON);
                    }
                }
            }

            // The file at jsonFilePath is replaced, not appended to.
            logger.Info(string.Format("Overwriting {0}...", jsonFilePath));
            string serialized = JsonConvert.SerializeObject(chapterJSON, Formatting.Indented);
            File.WriteAllText(jsonFilePath, serialized, System.Text.Encoding.UTF8);
        }
Example #2
0
        /// <summary>
        /// Loads a chapter JSON file and merges its word index into the master dictionary file.
        /// </summary>
        /// <param name="chapterJSONFile">Path of the chapter JSON file to read.</param>
        /// <param name="masterFilePath">Path of the master dictionary file, passed through to the merge step.</param>
        public void AddJSONToDictionary(string chapterJSONFile, string masterFilePath)
        {
            if (!File.Exists(chapterJSONFile))
            {
                logger.Error("File was missing.");
                return;
            }

            ChapterJSON chapterJSON = JsonConvert.DeserializeObject <ChapterJSON>(File.ReadAllText(chapterJSONFile));

            // Json.NET returns null (rather than throwing) for an empty file or a literal "null";
            // bail out here instead of failing with a NullReferenceException inside the merge.
            if (chapterJSON == null)
            {
                logger.Error(string.Format("Chapter file {0} did not contain a chapter object.", chapterJSONFile));
                return;
            }

            AddExistingChapterJSONToMasterDictionary(chapterJSON, masterFilePath);
        }
Example #3
0
 /// <summary>
 /// Records that a word occurs on a page in the chapter's word index.
 /// A page number is stored at most once per word.
 /// </summary>
 /// <param name="upperWord">Index key for the word.</param>
 /// <param name="fileNum">Page number to record for the word.</param>
 /// <param name="chapterJSON">Chapter whose index is updated in place.</param>
 private void InsertWordToChapterDict(string upperWord, int fileNum, ChapterJSON chapterJSON)
 {
     var wordIndex = chapterJSON.MentionedWordChapterLocation;

     List <int> knownPages;
     if (!wordIndex.TryGetValue(upperWord, out knownPages))
     {
         // First sighting of this word: start its page list with the current page.
         wordIndex.Add(upperWord, new List <int> { fileNum });
         return;
     }

     if (!knownPages.Contains(fileNum))
     {
         knownPages.Add(fileNum);
     }
 }
Example #4
0
        /// <summary>
        /// Used to update existing chapter directories and thus the master dictionary after functionality changes.
        /// Deletes the master dictionary file, regenerates the word index of every chapter file in the directory
        /// using the current rules, and then rebuilds the master dictionary from the directory.
        /// </summary>
        /// <param name="jsonFileDir">Directory whose files are treated as chapter JSON files and rewritten in place.</param>
        /// <param name="masterFile">Path of the master dictionary file; it is deleted before regeneration starts.</param>
        public void RegenerateExistingJSON(string jsonFileDir, string masterFile)
        {
            if (!Directory.Exists(jsonFileDir))
            {
                logger.Error("Directory does not exist.");
                return;
            }

            // The listing is taken before the master file is deleted, so it may still contain the master file
            // when that file lives inside jsonFileDir.
            List <string> jsonPaths = new List <string>(Directory.EnumerateFiles(jsonFileDir));
            string masterFullPath = Path.GetFullPath(masterFile);
            File.Delete(masterFile);

            foreach (var jsonPath in jsonPaths)
            {
                // Skip the (now deleted) master file; reading it would throw FileNotFoundException.
                // Case-insensitive so a differently-cased spelling of the same path still matches.
                if (string.Equals(Path.GetFullPath(jsonPath), masterFullPath, System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                ChapterJSON chapterJSON = JsonConvert.DeserializeObject <ChapterJSON>(File.ReadAllText(jsonPath));
                if (chapterJSON == null)
                {
                    // Json.NET returns null for an empty file or a literal "null"; there is nothing to regenerate.
                    logger.Warn(string.Format("Skipping {0}: it did not contain a chapter object.", jsonPath));
                    continue;
                }

                RegenerateJSON(jsonPath, chapterJSON);
            }

            BulkAddJSONToDictionary(jsonFileDir, masterFile);
        }
Example #5
0
        /// <summary>
        /// Converts the per-page responses of one chapter into a single chapter JSON file, builds the chapter's
        /// word-to-page index along the way, and merges that index into the master dictionary.
        /// </summary>
        /// <param name="inputJTokens">One token per page; each must serialize to a JSON array whose first element is the page response.</param>
        /// <param name="outputFile">Path the chapter JSON is written to (overwritten if it exists).</param>
        /// <param name="masterFilePath">Path of the master dictionary file.</param>
        /// <param name="chapterNum">Chapter number stored in the chapter JSON.</param>
        public void FormatJSON(List <JToken> inputJTokens, string outputFile, string masterFilePath, double chapterNum)
        {
            // Validate an existing master dictionary up front. The parsed value is not needed here
            // (the merge step loads the file itself); the point is to stop before any work is done or
            // any file is written when the master dictionary cannot be read.
            if (File.Exists(masterFilePath))
            {
                try
                {
                    JsonConvert.DeserializeObject <MasterDictionary>(File.ReadAllText(masterFilePath));
                }
                catch
                {
                    logger.Error(string.Format("Unable to open the master dictionary at {0}.  Halting.", masterFilePath));
                    return;
                }
            }

            logger.Info(string.Format("Formatting chapter {0}.", chapterNum));
            int fileNum        = 0;
            int totalJSONFiles = inputJTokens.Count;

            ChapterJSON chapterJSON = new ChapterJSON
            {
                Chapter = chapterNum,
                Pages   = new List <FormattedJSONPage>(),
                MentionedWordChapterLocation = new Dictionary <string, List <int> >()
            };

            foreach (var token in inputJTokens)
            {
                ++fileNum;
                string fileText = token.ToString();
                try
                {
                    ResponseJSON responseJSON = JsonConvert.DeserializeObject <List <ResponseJSON> >(fileText)[0];

                    FormattedJSONPage formattedPage = new FormattedJSONPage
                    {
                        Width  = responseJSON.Width,
                        Height = responseJSON.Height,
                        Words  = new List <FormattedWord>(),
                        Page   = fileNum // The position in the input list is trusted to be the page number.
                    };

                    foreach (var line in responseJSON.Lines)
                    {
                        FormattedWord formattedWord = new FormattedWord
                        {
                            Text        = line.Text,
                            BoundingBox = new List <Coords>()
                        };

                        // The bounding box arrives as a flat list of 8 numbers; pair them up into 4 points.
                        // A shorter list throws here and the whole page is skipped by the catch below.
                        for (int i = 0; i < 7; i += 2)
                        {
                            formattedWord.BoundingBox.Add(new Coords {
                                X = line.BoundingBox.ElementAt(i), Y = line.BoundingBox.ElementAt(i + 1)
                            });
                        }
                        formattedPage.Words.Add(formattedWord);

                        if (line.Text.EndsWith('-') || line.Text.StartsWith('-'))
                        {
                            FixHyphens(chapterJSON, responseJSON.Lines, line, fileNum);
                        }

                        // Index every individual word of the line.
                        foreach (var word in line.Words)
                        {
                            var upperWord = RegexFormatAndToUpperWord(word.Text);
                            upperWord = AutoCorrectList(upperWord);

                            if (upperWord.Length > 0)
                            {
                                InsertWordToChapterDict(upperWord, fileNum, chapterJSON);
                            }
                        }
                    }
                    chapterJSON.Pages.Add(formattedPage);
                }
                catch (Exception ex)
                {
                    // Best effort: a page that cannot be processed is dropped, the rest of the chapter continues.
                    logger.Error(string.Format("Skipping file {0} of {1}, encountered exception: {2}", fileNum, totalJSONFiles, ex.Message));
                }
            }

            try
            {
                File.WriteAllText(outputFile, JsonConvert.SerializeObject(chapterJSON, Formatting.Indented), System.Text.Encoding.UTF8);
                AddExistingChapterJSONToMasterDictionary(chapterJSON, masterFilePath);
            }
            catch (Exception ex)
            {
                // Plain concatenation on purpose: passing exception text to string.Format as the format
                // string throws FormatException whenever that text contains '{' or '}'.
                logger.Error("Error while writing files: " + ex);
            }
        }
Example #6
0
        /// <summary>
        /// For a line whose text ends with a hyphen, looks for the line on the same page that continues the
        /// split word and adds the re-joined word to the chapter's word index.
        /// </summary>
        /// <param name="chapterJSON">Chapter whose word index receives the joined words.</param>
        /// <param name="lines">All lines of the page; searched in order for the continuation line.</param>
        /// <param name="hyphenedLine">Line under inspection; nothing happens unless its text ends with '-'.</param>
        /// <param name="fileNum">Page number recorded for the inserted words.</param>
        private void FixHyphens(ChapterJSON chapterJSON, List <Line> lines, Line hyphenedLine, int fileNum)
        {
            /*
             * How this works:
             * Walk the page's lines looking for a candidate whose top-left Y is greater than the hyphened
             * line's top-left Y, and ask DoIntersect whether the two boxes meet when given the hyphened
             * line's height as the extra allowance.
             * On the first match, stop searching and insert:
             *   - the last word of the hyphened line followed by the first word of the candidate line;
             *   - the same pair with each half normalized separately first. This second entry is NOT
             *     inserted when the candidate's first word is in the must-keep-hyphen list below.
             * The individual words themselves are indexed by the caller.
             */

            // Words that must stay attached with a hyphen; no de-hyphenated variant is indexed for them.
            IReadOnlyList <string> mustKeepHyphen = new List <string> {
                "SAMA", "SAN", "SENSEI", "KUN", "CHAN", "BO", "SENPAI"
            };

            if (!hyphenedLine.Text.EndsWith('-'))
            {
                return;
            }

            if (hyphenedLine.BoundingBox.Count != 8)
            {
                // This should never occur, BUT it has been seen with Google's Vision API
                logger.Warn("Skipping, current hyphened line: " + hyphenedLine.Text + " somehow had more or less than 8 coordinates...");
                return;
            }

            var hyphenLineTopLeftCoords = new Coords {
                X = hyphenedLine.BoundingBox.ElementAt(0), Y = hyphenedLine.BoundingBox.ElementAt(1)
            };
            var hyphenLineBottomRightCoords = new Coords {
                X = hyphenedLine.BoundingBox.ElementAt(4), Y = hyphenedLine.BoundingBox.ElementAt(5)
            };
            // Height of the hyphened line; loop-invariant, so computed once.
            var hyphenLineHeight = Math.Abs(hyphenLineTopLeftCoords.Y - hyphenLineBottomRightCoords.Y);

            foreach (var candidateLine in lines)
            {
                if (candidateLine.BoundingBox.Count != 8)
                {
                    // This should never occur, BUT it has been seen with Google's Vision API
                    logger.Warn("Skipping, current candidate line: " + candidateLine.Text + " somehow had more or less than 8 coordinates...");
                    continue;
                }

                var candidateLineTopLeftCoords = new Coords {
                    X = candidateLine.BoundingBox.ElementAt(0), Y = candidateLine.BoundingBox.ElementAt(1)
                };
                var candidateLineBottomRightCoords = new Coords {
                    X = candidateLine.BoundingBox.ElementAt(4), Y = candidateLine.BoundingBox.ElementAt(5)
                };

                // Skip the hyphened line itself.
                // NOTE(review): both operands are freshly constructed; unless Coords overloads !=, this is a
                // reference comparison that is always true. The strict Y check below still excludes the
                // hyphened line itself, so the outcome is the same either way.
                if (!(candidateLineTopLeftCoords != hyphenLineTopLeftCoords))
                {
                    continue;
                }

                if (!(hyphenLineTopLeftCoords.Y < candidateLineTopLeftCoords.Y))
                {
                    continue;
                }

                if (!DoIntersect(hyphenLineTopLeftCoords, hyphenLineBottomRightCoords, candidateLineTopLeftCoords, candidateLineBottomRightCoords, hyphenLineHeight))
                {
                    continue;
                }

                string lastWordOfHyphenedLine = hyphenedLine.Words.Last().Text;
                string firstWordOfCandidate   = candidateLine.Words.First().Text;

                string hyphenedWord      = lastWordOfHyphenedLine + AutoCorrectList(firstWordOfCandidate);
                string hyphenedUpperWord = RegexFormatAndToUpperWord(hyphenedWord);
                if (hyphenedUpperWord.Length > 0)
                {
                    InsertWordToChapterDict(hyphenedUpperWord, fileNum, chapterJSON);
                }

                // Skip if the word MUST contain a hyphen!
                if (!mustKeepHyphen.Contains(RegexFormatAndToUpperWord(firstWordOfCandidate.ToUpper())))
                {
                    string unHyphenedWord      = AutoCorrectList(RegexFormatAndToUpperWord(lastWordOfHyphenedLine)) + AutoCorrectList(RegexFormatAndToUpperWord(firstWordOfCandidate));
                    string unHyphenedUpperWord = RegexFormatAndToUpperWord(unHyphenedWord);

                    // Guard on the word actually being inserted, so an empty key never reaches the index.
                    if (unHyphenedUpperWord.Length > 0)
                    {
                        logger.Info("Inserting: " + unHyphenedUpperWord);
                        InsertWordToChapterDict(unHyphenedUpperWord, fileNum, chapterJSON);
                    }
                }

                // Only the first matching candidate is used.
                break;
            }
        }
Example #7
0
        /// <summary>
        /// Re-joins a word that was split across two lines of an already formatted page and adds the joined
        /// forms to the chapter's word index. Handles both a line ending with '-' (continuation is searched
        /// among lines with a greater top-left Y) and a line starting with '-' (the other half is searched
        /// among lines with a smaller top-left Y).
        /// </summary>
        /// <param name="chapterJSON">Chapter that holds the page and receives the joined words.</param>
        /// <param name="hyphenedFormattedWord">Line under inspection; ignored unless its text starts or ends with '-'.</param>
        /// <param name="fileNum">Page number of the line; used to find the page and recorded for inserted words.</param>
        private void FixHyphens(ChapterJSON chapterJSON, FormattedWord hyphenedFormattedWord, int fileNum)
        {
            Console.WriteLine("Hyphened word: " + hyphenedFormattedWord.Text);
            bool endsWithHyphen = hyphenedFormattedWord.Text.EndsWith('-');
            if (!(endsWithHyphen || hyphenedFormattedWord.Text.StartsWith('-')))
            {
                return;
            }

            // Words that must stay attached with a hyphen; no de-hyphenated variant is indexed for them.
            IReadOnlyList <string> HAS_HYPHEN_LIST = new List <string> {
                "SAMA", "SAN", "SENSEI", "KUN", "CHAN", "SENPAI"
            };
            // Continuations for which the hyphen-joined variant is not indexed.
            IReadOnlyList <string> NO_HYPHEN_LIST = new List <string> {
                "ING", "INGS"
            };

            // Find the page by its stored page number instead of by list position: pages that failed
            // during formatting are absent from the list, so position and page number can disagree.
            var page = chapterJSON.Pages.FirstOrDefault(p => p.Page == fileNum);
            if (page == null)
            {
                logger.Warn("Skipping hyphen fix, page " + fileNum + " was not found in the chapter.");
                return;
            }

            foreach (var candidateLine in page.Words)
            {
                // We skip if the coords are the same... as then the line we're looking at is equal to the hyphen line
                if (!(candidateLine.BoundingBox[0] != hyphenedFormattedWord.BoundingBox[0] && candidateLine.BoundingBox[2] != hyphenedFormattedWord.BoundingBox[2]))
                {
                    continue;
                }

                if (endsWithHyphen)
                {
                    // Trailing hyphen: the continuation has a greater top-left Y than the hyphened line.
                    if (hyphenedFormattedWord.BoundingBox[0].Y < candidateLine.BoundingBox[0].Y)
                    {
                        logger.Debug("Word: " + candidateLine.Text);
                        if (DoIntersect(hyphenedFormattedWord.BoundingBox[0], hyphenedFormattedWord.BoundingBox[2], candidateLine.BoundingBox[0], candidateLine.BoundingBox[2], Math.Abs(hyphenedFormattedWord.BoundingBox[0].Y - hyphenedFormattedWord.BoundingBox[3].Y)))
                        {
                            string firstHalf  = hyphenedFormattedWord.Text.Split(' ').Last();
                            string secondHalf = candidateLine.Text.Split(' ').First();

                            if (!NO_HYPHEN_LIST.Contains(AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(secondHalf)))))
                            {
                                string hyphenedWord      = AutoCorrectList(firstHalf) + AutoCorrectList(RegexFormatAndToUpperWord(secondHalf));
                                string hyphenedUpperWord = RegexFormatAndToUpperWord(hyphenedWord);
                                if (hyphenedUpperWord.Length > 0)
                                {
                                    Console.WriteLine("Inserting: " + hyphenedUpperWord);
                                    InsertWordToChapterDict(hyphenedUpperWord, fileNum, chapterJSON);
                                }
                            }

                            // Skip if the word MUST contain a hyphen!
                            if (!HAS_HYPHEN_LIST.Contains(AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(secondHalf)))))
                            {
                                string unHyphenedWord      = AutoCorrectList(RegexFormatAndToUpperWord(firstHalf)) + AutoCorrectList(RegexFormatAndToUpperWord(secondHalf));
                                string unHyphenedUpperWord = RegexFormatAndToUpperWord(unHyphenedWord);

                                if (unHyphenedUpperWord.Length > 0)
                                {
                                    Console.WriteLine("Inserting no-hyphen: " + unHyphenedUpperWord);
                                    InsertWordToChapterDict(unHyphenedUpperWord, fileNum, chapterJSON);
                                }
                            }

                            // Only the first matching candidate is used.
                            break;
                        }
                    }
                }
                else
                {
                    // Leading hyphen: the first half has a smaller top-left Y, so the roles of the two lines swap.
                    if (hyphenedFormattedWord.BoundingBox[0].Y > candidateLine.BoundingBox[0].Y)
                    {
                        if (DoIntersect(candidateLine.BoundingBox[0], candidateLine.BoundingBox[2], hyphenedFormattedWord.BoundingBox[0], hyphenedFormattedWord.BoundingBox[2], Math.Abs(hyphenedFormattedWord.BoundingBox[0].Y - hyphenedFormattedWord.BoundingBox[3].Y)))
                        {
                            string firstHalf  = candidateLine.Text.Split(' ').Last();
                            string secondHalf = hyphenedFormattedWord.Text.Split(' ').First();

                            if (!NO_HYPHEN_LIST.Contains(AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(secondHalf)))))
                            {
                                string hyphenedWord      = AutoCorrectList(firstHalf) + AutoCorrectList(RegexFormatAndToUpperWord(secondHalf));
                                string hyphenedUpperWord = RegexFormatAndToUpperWord(hyphenedWord);
                                if (hyphenedUpperWord.Length > 0)
                                {
                                    Console.WriteLine("Inserting: " + hyphenedUpperWord);
                                    InsertWordToChapterDict(hyphenedUpperWord, fileNum, chapterJSON);
                                }
                            }

                            // Skip if the word MUST contain a hyphen!
                            if (!HAS_HYPHEN_LIST.Contains(AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(secondHalf)))))
                            {
                                string unHyphenedWord      = AutoCorrectList(RegexFormatAndToUpperWord(firstHalf)) + AutoCorrectList(TrimAllHyphens(RegexFormatAndToUpperWord(secondHalf)));
                                string unHyphenedUpperWord = RegexFormatAndToUpperWord(unHyphenedWord);

                                if (unHyphenedUpperWord.Length > 0)
                                {
                                    Console.WriteLine("Inserting: " + unHyphenedUpperWord);
                                    InsertWordToChapterDict(unHyphenedUpperWord, fileNum, chapterJSON);
                                }
                            }

                            // Only the first matching candidate is used.
                            break;
                        }
                    }
                }
            }
        }
Example #8
0
        /// <summary>
        /// Merges a chapter's word-to-page index into the master dictionary file and rewrites that file.
        /// A missing master file is treated as an empty dictionary; an unreadable one aborts the merge so
        /// that it is not overwritten.
        /// </summary>
        /// <param name="chapterJSON">Chapter whose index is merged; it is not modified.</param>
        /// <param name="masterFilePath">Path of the master dictionary file to load and rewrite.</param>
        private void AddExistingChapterJSONToMasterDictionary(ChapterJSON chapterJSON, string masterFilePath)
        {
            MasterDictionary masterDictionary = new MasterDictionary();

            if (File.Exists(masterFilePath))
            {
                try
                {
                    masterDictionary = JsonConvert.DeserializeObject <MasterDictionary>(File.ReadAllText(masterFilePath));
                }
                catch
                {
                    // Actually halt: continuing would merge into an empty dictionary and then either
                    // crash or overwrite the existing (unreadable) master file.
                    logger.Error(string.Format("Unable to open the master dictionary at {0}.  Halting.", masterFilePath));
                    return;
                }

                // Json.NET returns null for an empty file or a literal "null"; start from an empty dictionary.
                if (masterDictionary == null)
                {
                    masterDictionary = new MasterDictionary();
                }
            }
            else
            {
                logger.Info("Created new dictionary.");
            }

            if (masterDictionary.MentionedWordLocation == null)
            {
                masterDictionary.MentionedWordLocation = new SortedDictionary <string, SortedDictionary <string, List <int> > >();
            }

            double chapterNum = chapterJSON.Chapter;

            logger.Info(string.Format("Formatting chapter {0}.", chapterNum));

            // Chapter key: the number with '.' replaced by '-'. Formatted with the invariant culture so
            // the decimal separator is always '.', regardless of the machine's regional settings.
            string chapterNumString = chapterNum.ToString(System.Globalization.CultureInfo.InvariantCulture).Replace(".", "-");

            foreach (var wordEntry in chapterJSON.MentionedWordChapterLocation)
            {
                var pageList  = wordEntry.Value;
                var upperWord = AutoCorrectList(wordEntry.Key);

                SortedDictionary <string, List <int> > chaptersForWord;
                if (!masterDictionary.MentionedWordLocation.TryGetValue(upperWord, out chaptersForWord))
                {
                    // Word DNE in dictionary
                    chaptersForWord = new SortedDictionary <string, List <int> >();
                    masterDictionary.MentionedWordLocation.Add(upperWord, chaptersForWord);
                }

                List <int> pagesForChapter;
                if (!chaptersForWord.TryGetValue(chapterNumString, out pagesForChapter))
                {
                    // Chapter DNE for this word. Store a copy: two chapter keys can map to the same
                    // corrected word, and a later merge must not mutate the chapter's own list.
                    chaptersForWord.Add(chapterNumString, new List <int>(pageList));
                }
                else
                {
                    foreach (var pageNum in pageList)
                    {
                        if (!pagesForChapter.Contains(pageNum))
                        {
                            pagesForChapter.Add(pageNum);
                        }
                    }
                }
            }

            GenerateMasterJSON(masterFilePath, masterDictionary);
        }