/// ------------------------------------------------------------------------------------
/// <summary>
/// Scans the given tokens for repeated words and collects a TextTokenSubstring for each
/// occurrence found. Note text and body text (verse/other) are fed to two separate
/// processors that both add their findings to the same result list.
/// </summary>
/// <param name="tokens">The tokens (from the data source) to check for repeated words.
/// </param>
/// <param name="desiredKey">If looking for occurrences of a specific repeated word,
/// set this to be that word; otherwise pass an empty string.</param>
/// <returns>The list of repeated-word substrings (the same list that is stored in
/// m_repeatedWords).</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens,
    string desiredKey)
{
#if DEBUG
    // Debug-only snapshot of the incoming tokens.
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
#endif
    characterCategorizer = m_checksDataSource.CharacterCategorizer;

    // Blank-separated words that may validly be repeated.
    ValidItems = m_checksDataSource.GetParameterValue("RepeatableWords");
    // Words that are known to be not repeatable.
    InvalidItems = m_checksDataSource.GetParameterValue("NonRepeatableWords");

    TextType previousType = TextType.Other;
    m_repeatedWords = new List<TextTokenSubstring>();

    ProcessRepeatedWords bodyWords = new ProcessRepeatedWords(
        characterCategorizer, m_repeatedWords, desiredKey);
    ProcessRepeatedWords noteWords = new ProcessRepeatedWords(
        characterCategorizer, m_repeatedWords, desiredKey);

    foreach (ITextToken token in tokens)
    {
        // A new paragraph restarts both trackers.
        if (token.IsParagraphStart)
        {
            noteWords.Reset();
            bodyWords.Reset();
        }

        if (token.TextType == TextType.Note)
        {
            // Each note is checked on its own.
            if (token.IsNoteStart)
            {
                noteWords.Reset();
            }
            noteWords.ProcessToken(token);
        }

        // The previous token was a picture caption, so we are leaving the caption:
        // start over. (Entering a caption is a paragraph start, which has already
        // been handled above.)
        if (previousType == TextType.PictureCaption)
        {
            noteWords.Reset();
        }

        if (token.TextType == TextType.Verse || token.TextType == TextType.Other)
        {
            // Body text ends any note that was in progress.
            noteWords.Reset();
            bodyWords.ProcessToken(token);
        }

        if (token.TextType == TextType.ChapterNumber)
        {
            bodyWords.Reset();
        }

        previousType = token.TextType;
    }

    return m_repeatedWords;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets every instance (valid and invalid) of the item being checked in the given
/// token list. This is used:
/// 1) to build an inventory of these items, so the user can be shown all instances;
/// 2) with a "desiredKey", to fetch the instances of one specific item.
/// Only tokens whose locale equals the "PreferredLocale" parameter are scanned (a null
/// locale or parameter value is treated as an empty string).
/// </summary>
/// <param name="tokens">Tokens for text to be scanned</param>
/// <param name="desiredKey">If you only want instances of a specific key (e.g. one word,
/// one punctuation pattern, one character, etc.) place it here. Empty string returns
/// all items.</param>
/// <returns>List of token substrings (the same list that is stored in
/// m_mixedCapitalization)</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens,
    string desiredKey)
{
#if DEBUG
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
    if (AllTokens.Count == 0)
    {
        // Keep the compiler from complaining about assigning to a variable, but not using it.
    }
#endif
    m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
    ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
    InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    m_mixedCapitalization = new List<TextTokenSubstring>();
    ProcessMixedCapitalization wordScanner =
        new ProcessMixedCapitalization(m_checksDataSource, m_mixedCapitalization);

    foreach (ITextToken token in tokens)
    {
        string tokenLocale = token.Locale ?? string.Empty;
        if (tokenLocale == preferredLocale)
        {
            // Hand every word of the token's text to the scanner.
            foreach (WordAndPunct wordAndPunct in
                m_characterCategorizer.WordAndPuncts(token.Text))
            {
                wordScanner.ProcessWord(token, wordAndPunct, desiredKey);
            }
        }
    }

    return m_mixedCapitalization;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="CapitalizationProcessor"/> class.
/// Reads the "Abbreviations" and "SentenceFinalPunctuation" parameters from the data
/// source.
/// </summary>
/// <param name="checksDataSource">The source of data for Scripture checking.</param>
/// <param name="allCapitalizedStyles">Dictionary keyed by the style name containing the
/// type of style (character/paragraph) and a value indicating why it should begin with
/// a capital.</param>
/// ------------------------------------------------------------------------------------
public CapitalizationProcessor(IChecksDataSource checksDataSource,
    Dictionary<string, StyleCapInfo> allCapitalizedStyles)
{
    m_checksDataSource = checksDataSource;
    m_categorizer = checksDataSource.CharacterCategorizer;

    // A parameter value can be null (the sentence-final punctuation below is already
    // guarded for that), so fall back to an empty string rather than throwing a
    // NullReferenceException. The result is then the same array that an empty
    // "Abbreviations" value produces.
    m_abbreviations =
        (checksDataSource.GetParameterValue("Abbreviations") ?? string.Empty).Split();
    m_allCapitalizedStyles = allCapitalizedStyles;

    string sentenceFinalPunc = checksDataSource.GetParameterValue("SentenceFinalPunctuation");
    if (!string.IsNullOrEmpty(sentenceFinalPunc))
    {
        // Every character of the parameter value is a sentence-final punctuation mark.
        foreach (char ch in sentenceFinalPunc)
        {
            m_validSentenceFinalPuncts.Add(ch);
        }
    }
    else
    {
        // No punctuation is set up for this writing system that contains sentence-final punctuation.
        // Define sentence-final punctuation with these characters as a fallback: '.', '?', and '!'
        m_validSentenceFinalPuncts.Add('.');
        m_validSentenceFinalPuncts.Add('?');
        m_validSentenceFinalPuncts.Add('!');
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="ProcessMixedCapitalization"/> class.
/// Reads the "UncapitalizedPrefixes", "CapitalizedSuffixes" and "CapitalizedPrefixes"
/// parameters from the data source; each is a whitespace-separated list.
/// </summary>
/// <param name="checksDataSource">The source of data for Scripture checking.</param>
/// <param name="result">The list stored as this processor's result list.</param>
/// ------------------------------------------------------------------------------------
public ProcessMixedCapitalization(IChecksDataSource checksDataSource,
    List<TextTokenSubstring> result)
{
    m_categorizer = checksDataSource.CharacterCategorizer;
    m_result = result;

    // A parameter that has not been set may come back as null. Treat it like an empty
    // string so that Split() does not throw a NullReferenceException; the resulting
    // list is then identical to the one an empty parameter value produces.
    m_uncapitalizedPrefixes = new List<string>(
        (checksDataSource.GetParameterValue("UncapitalizedPrefixes") ?? string.Empty).Split());
    m_capitalizedSuffixes = new List<string>(
        (checksDataSource.GetParameterValue("CapitalizedSuffixes") ?? string.Empty).Split());
    m_capitalizedPrefixes = new List<string>(
        (checksDataSource.GetParameterValue("CapitalizedPrefixes") ?? string.Empty).Split());
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Runs the capitalization check over the given tokens and returns the substrings
/// reported as capitalization errors. Note tokens and body tokens (verse/other) are
/// handled by two separate processors; tokens of any other text type are skipped.
/// </summary>
/// <param name="tokens">The Scripture tokens.</param>
/// <returns>list of capitalization errors (the same list that is stored in
/// m_capitalizationErrors).</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens)
{
    m_SentenceFinalPunc = m_chkDataSource.GetParameterValue(kSentenceFinalPuncParameter);

    // The style information is loaded only once, the first time it is needed.
    if (m_stylePropsInfo == null)
    {
        string serializedStyles = m_chkDataSource.GetParameterValue(kStyleSheetInfoParameter);
        Debug.Assert(!string.IsNullOrEmpty(serializedStyles), "Style information not provided.");
        m_stylePropsInfo = StylePropsInfo.Load(serializedStyles);
        CreateCapitalStyleDictionary();
        Debug.Assert(m_allCapitalizedStyles.Count > 0, "No styles require capitalization.");
    }

    CapitalizationProcessor bodyChecker =
        new CapitalizationProcessor(m_chkDataSource, m_allCapitalizedStyles);
    CapitalizationProcessor noteChecker =
        new CapitalizationProcessor(m_chkDataSource, m_allCapitalizedStyles);
    noteChecker.ProcessParagraphsSeparately = true;

    m_capitalizationErrors = new List<TextTokenSubstring>();

    // A single wrapper instance is re-pointed at each body token in turn.
    VerseTextToken verseWrapper = new VerseTextToken();
    foreach (ITextToken token in tokens)
    {
        ITextToken current = token;
        if (token.TextType != TextType.Note && token.TextType != TextType.PictureCaption)
        {
            // Make the token one of our special capitalization text tokens.
            verseWrapper.Token = token;
            current = verseWrapper;
        }

        if (current.TextType == TextType.Note)
        {
            noteChecker.ProcessToken(current, m_capitalizationErrors);
        }
        else if (current.TextType == TextType.Verse || current.TextType == TextType.Other)
        {
            bodyChecker.ProcessToken(current, m_capitalizationErrors);
        }
    }

    return m_capitalizationErrors;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets the parameters needed for this check from the data source: versification
/// scheme, book id, chapter to check, verse bridge, script digit zero and the two
/// sub-verse letters. Also builds the regular expressions used to validate verse and
/// chapter numbers.
/// </summary>
/// ------------------------------------------------------------------------------------
private void GetParameters()
{
    m_versificationScheme = m_checksDataSource.GetParameterValue(ksVerseSchemeParam);
    ScrVers scrVers;
    try
    {
        scrVers = (ScrVers)Enum.Parse(typeof(ScrVers), m_versificationScheme);
    }
    catch (ArgumentException)
    {
        // Null, empty or unrecognized scheme name: default to English.
        scrVers = ScrVers.English;
    }
    catch (OverflowException)
    {
        // Numeric value outside the range of the enum's underlying type: default to English.
        scrVers = ScrVers.English;
    }

    m_versification = VersificationTable.Get(scrVers);

    m_sBookId = m_checksDataSource.GetParameterValue(ksBookIdParam);
    if (!int.TryParse(m_checksDataSource.GetParameterValue(ksChapterParam), out m_nChapterToCheck))
    {
        m_nChapterToCheck = 0;
    }

    string temp = m_checksDataSource.GetParameterValue(ksVerseBridgeParam);
    string verseBridge = (string.IsNullOrEmpty(temp)) ? "-" : temp;

    temp = m_checksDataSource.GetParameterValue(ksScriptDigitZeroParam);
    char scriptDigitZero = (string.IsNullOrEmpty(temp)) ? '0' : temp[0];

    // First digit must be 1-9 (no leading zero), followed by any number of digits 0-9,
    // expressed relative to the script's own zero digit.
    string numberRange = string.Format("[{1}-{2}][{0}-{2}]*", scriptDigitZero,
        (char)(scriptDigitZero + 1), (char)(scriptDigitZero + 9));

    // The sub-verse letters keep their current values unless the parameter is set.
    temp = m_checksDataSource.GetParameterValue(ksSubVerseLetterAParam);
    if (!string.IsNullOrEmpty(temp))
    {
        m_subVerseA = temp;
    }
    temp = m_checksDataSource.GetParameterValue(ksSubVerseLetterBParam);
    if (!string.IsNullOrEmpty(temp))
    {
        m_subVerseB = temp;
    }
    string subverseRange = string.Format("[{0}{1}]?", m_subVerseA, m_subVerseB);

    // The verse bridge is literal text, not a pattern, so it must be escaped before it
    // is spliced into the regular expression (otherwise a bridge such as "." would
    // match any character). The default "-" is unaffected by escaping.
    // With digit zero '0', bridge "-" and sub-verse letters a/b the pattern is:
    // "^[1-9][0-9]*[ab]?(-[1-9][0-9]*[ab]?)?$"
    m_verseNumberFormat = new Regex(String.Format("^{0}{1}({2}{0}{1})?$",
        numberRange, subverseRange, Regex.Escape(verseBridge)));
    m_chapterNumberFormat = new Regex("^" + numberRange + "$");
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="CapitalizationProcessor"/> class.
/// Reads the "Abbreviations" and "SentenceFinalPunctuation" parameters from the data
/// source.
/// </summary>
/// <param name="checksDataSource">The source of data for Scripture checking.</param>
/// <param name="allCapitalizedStyles">Dictionary keyed by the style name containing the
/// type of style (character/paragraph) and a value indicating why it should begin with
/// a capital.</param>
/// ------------------------------------------------------------------------------------
public CapitalizationProcessor(IChecksDataSource checksDataSource,
    Dictionary<string, StyleCapInfo> allCapitalizedStyles)
{
    m_checksDataSource = checksDataSource;
    m_categorizer = checksDataSource.CharacterCategorizer;

    // The parameter value may be null (the sentence-final punctuation lookup below
    // already allows for that). Substitute an empty string so Split() cannot throw a
    // NullReferenceException; this yields the same array as an empty parameter value.
    string abbreviations = checksDataSource.GetParameterValue("Abbreviations");
    m_abbreviations = (abbreviations ?? string.Empty).Split();
    m_allCapitalizedStyles = allCapitalizedStyles;

    string sentenceFinalPunc = checksDataSource.GetParameterValue("SentenceFinalPunctuation");
    if (!string.IsNullOrEmpty(sentenceFinalPunc))
    {
        // Every character of the parameter value is a sentence-final punctuation mark.
        foreach (char ch in sentenceFinalPunc)
        {
            m_validSentenceFinalPuncts.Add(ch);
        }
    }
    else
    {
        // No punctuation is set up for this writing system that contains sentence-final punctuation.
        // Define sentence-final punctuation with these characters as a fallback: '.', '?', and '!'
        m_validSentenceFinalPuncts.Add('.');
        m_validSentenceFinalPuncts.Add('?');
        m_validSentenceFinalPuncts.Add('!');
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Runs the matched-pairs processing over the given tokens and returns the collected
/// substrings. Note text and body text are tracked by two separate processors. Tokens
/// without text, or whose locale differs from the "PreferredLocale" parameter, are
/// skipped.
/// </summary>
/// <param name="tokens">Tokens for text to be scanned</param>
/// <param name="desiredKey">Key passed through to the processors; see their handling
/// for how it restricts the results.</param>
/// <returns>List of token substrings (the same list that is stored in
/// m_unmatchedPairs)</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens,
    string desiredKey)
{
#if DEBUG
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
    if (AllTokens.Count == 0)
    {
        // Keep the compiler from complaining about assigning to a variable, but not using it.
    }
#endif
    // NOTE: the character categorizer is deliberately not fetched here; the original
    // assignment to m_characterCategorizer is disabled.
    ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
    InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;
    string poeticStyles = m_checksDataSource.GetParameterValue("PoeticStyles");
    string introductionOutlineStyles =
        m_checksDataSource.GetParameterValue("IntroductionOutlineStyles");

    string matchedPairsXml = m_checksDataSource.GetParameterValue("MatchedPairs");
    string writingSystemName = m_checksDataSource.GetParameterValue("DefaultWritingSystemName");
    MatchedPairList pairList = MatchedPairList.Load(matchedPairsXml, writingSystemName);

    StyleCategorizer styleCategorizer =
        new StyleCategorizer(poeticStyles, introductionOutlineStyles);

    // Two independent processors: one for body text, one for notes.
    ProcessMatchedPairTokens bodyPairs =
        new ProcessMatchedPairTokens(m_checksDataSource, pairList, styleCategorizer);
    ProcessMatchedPairTokens notePairs =
        new ProcessMatchedPairTokens(m_checksDataSource, pairList, styleCategorizer);

    m_unmatchedPairs = new List<TextTokenSubstring>();

    foreach (ITextToken token in tokens)
    {
        bool usable = token.Text != null &&
            (token.Locale ?? string.Empty) == preferredLocale;
        if (!usable)
        {
            continue;
        }

        if (token.TextType == TextType.Note)
        {
            // A new note is starting: finalize any sequences from the previous note.
            if (token.IsNoteStart)
            {
                notePairs.FinalizeResult(desiredKey, m_unmatchedPairs);
            }
            notePairs.ProcessToken(token, desiredKey, m_unmatchedPairs);
            continue;
        }

        bool isBodyText = token.TextType == TextType.Verse ||
            token.TextType == TextType.Other ||
            token.IsParagraphStart;
        if (isBodyText)
        {
            // Body text: finalize any note that was in progress, then carry on with
            // the body text.
            notePairs.FinalizeResult(desiredKey, m_unmatchedPairs);
            bodyPairs.ProcessToken(token, desiredKey, m_unmatchedPairs);
        }
    }

    // Flush whatever is still pending in either processor.
    notePairs.FinalizeResult(desiredKey, m_unmatchedPairs);
    bodyPairs.FinalizeResult(desiredKey, m_unmatchedPairs);

    return m_unmatchedPairs;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="ProcessMixedCapitalization"/> class.
/// Reads the "UncapitalizedPrefixes", "CapitalizedSuffixes" and "CapitalizedPrefixes"
/// parameters from the data source; each is a whitespace-separated list.
/// </summary>
/// <param name="checksDataSource">The source of data for Scripture checking.</param>
/// <param name="result">The list stored as this processor's result list.</param>
/// ------------------------------------------------------------------------------------
public ProcessMixedCapitalization(IChecksDataSource checksDataSource,
    List<TextTokenSubstring> result)
{
    m_categorizer = checksDataSource.CharacterCategorizer;
    m_result = result;

    // An unset parameter may be returned as null; substituting an empty string keeps
    // Split() from throwing a NullReferenceException and gives the same list that an
    // empty parameter value gives.
    string uncapitalizedPrefixes = checksDataSource.GetParameterValue("UncapitalizedPrefixes");
    m_uncapitalizedPrefixes = new List<string>(
        (uncapitalizedPrefixes ?? string.Empty).Split());

    string capitalizedSuffixes = checksDataSource.GetParameterValue("CapitalizedSuffixes");
    m_capitalizedSuffixes = new List<string>(
        (capitalizedSuffixes ?? string.Empty).Split());

    string capitalizedPrefixes = checksDataSource.GetParameterValue("CapitalizedPrefixes");
    m_capitalizedPrefixes = new List<string>(
        (capitalizedPrefixes ?? string.Empty).Split());
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Returns a TextTokenSubstring for all occurrences of the desiredKey. Valid and
/// invalid patterns come from the "PunctuationPatterns" parameter when it is set, and
/// from the valid/invalid item parameters otherwise. Note text and body text are
/// handled by two separate processors.
/// </summary>
/// <param name="tokens">Tokens for text to be scanned</param>
/// <param name="desiredKey">e.g., _[_ or empty string to look for all patterns</param>
/// <returns>List of token substrings (the same list that is stored in
/// m_punctuationSequences)</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens,
    string desiredKey)
{
#if DEBUG
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
    if (AllTokens.Count == 0)
    {
        // Keep the compiler from complaining about assigning to a variable, but not using it.
    }
#endif
    m_characterCategorizer = m_checksDataSource.CharacterCategorizer;

    string patternsXml = m_checksDataSource.GetParameterValue("PunctuationPatterns");
    if (patternsXml == null || patternsXml.Trim().Length == 0)
    {
        // No pattern list available: use the plain valid/invalid item parameters.
        ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
        InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);
    }
    else
    {
        // Sort the loaded patterns into the valid and invalid lists.
        m_validItemsList = new List<string>();
        m_invalidItemsList = new List<string>();
        PuncPatternsList loadedPatterns = PuncPatternsList.Load(patternsXml,
            m_checksDataSource.GetParameterValue("DefaultWritingSystemName"));
        foreach (PuncPattern pattern in loadedPatterns)
        {
            if (!pattern.Valid)
            {
                m_invalidItemsList.Add(pattern.Pattern);
            }
            else
            {
                m_validItemsList.Add(pattern.Pattern);
            }
        }
    }

    // Anything other than "Advanced" or "Intermediate" means the basic level.
    string levelName = m_checksDataSource.GetParameterValue("PunctCheckLevel");
    CheckingLevel level;
    if (levelName == "Advanced")
    {
        level = CheckingLevel.Advanced;
    }
    else if (levelName == "Intermediate")
    {
        level = CheckingLevel.Intermediate;
    }
    else
    {
        level = CheckingLevel.Basic;
    }

    // Only the first character of the parameter is used as the whitespace representation.
    string whitespaceSetting = m_checksDataSource.GetParameterValue("PunctWhitespaceChar");
    if (!String.IsNullOrEmpty(whitespaceSetting))
    {
        s_whitespaceRep = whitespaceSetting.Substring(0, 1);
    }

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    QuotationMarkCategorizer quotationCategorizer =
        new QuotationMarkCategorizer(m_checksDataSource);

    // Create processing state machines, one for body text, one for notes.
    ProcessPunctationTokens bodyPunct = new ProcessPunctationTokens(
        m_characterCategorizer, quotationCategorizer, level);
    ProcessPunctationTokens notePunct = new ProcessPunctationTokens(
        m_characterCategorizer, quotationCategorizer, level);

    m_punctuationSequences = new List<TextTokenSubstring>();

    foreach (ITextToken token in tokens)
    {
        bool usable = token.Text != null &&
            (token.Locale ?? string.Empty) == preferredLocale;
        if (!usable)
        {
            continue;
        }

        if (token.TextType == TextType.Note)
        {
            // A new note is starting: finalize any punctuation sequences from the
            // previous note.
            if (token.IsNoteStart)
            {
                notePunct.FinalizeResult(desiredKey, m_punctuationSequences, true);
            }
            notePunct.ProcessToken(token, desiredKey, m_punctuationSequences);
        }
        else if (token.TextType == TextType.Verse || token.TextType == TextType.Other)
        {
            // Body text: finalize any note that was in progress and continue with
            // the body text.
            notePunct.FinalizeResult(desiredKey, m_punctuationSequences, true);
            bodyPunct.ProcessToken(token, desiredKey, m_punctuationSequences);
        }
        else if (token.IsParagraphStart)
        {
            // A paragraph start of some other text type closes the pending body sequence.
            bodyPunct.FinalizeResult(desiredKey, m_punctuationSequences, true);
            bodyPunct.TreatAsParagraphStart = true;
        }
    }

    // Flush whatever is still pending in either processor.
    notePunct.FinalizeResult(desiredKey, m_punctuationSequences, true);
    bodyPunct.FinalizeResult(desiredKey, m_punctuationSequences, true);

    return m_punctuationSequences;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets character references from the given tokens. Depending on the arguments this
/// returns every character sequence, only those equal to the desired key, or (when
/// <paramref name="invalidCharactersOnly"/> is set) those that are neither in the
/// always-valid characters nor in the valid characters of the token's locale.
/// </summary>
/// <param name="tokens">Tokens for text to be scanned</param>
/// <param name="desiredKey">The key to look for; an empty string means no specific
/// key is wanted.</param>
/// <param name="invalidCharactersOnly">True to report invalid characters; in that case
/// tokens of every locale are scanned, otherwise only tokens in the preferred locale.
/// </param>
/// <returns>List of token substrings (the same list that is stored in
/// m_characterSequences)</returns>
/// ------------------------------------------------------------------------------------
private List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens,
    string desiredKey, bool invalidCharactersOnly)
{
    if (m_categorizer == null)
    {
        m_categorizer = m_checksDataSource.CharacterCategorizer;
    }

    m_characterSequences = new List<TextTokenSubstring>();

    // Valid-character lookup tables, built on demand and cached per locale.
    Dictionary<string, Dictionary<string, bool>> validCharsByLocale =
        new Dictionary<string, Dictionary<string, bool>>();

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;
    bool wantSpecificKey = (desiredKey != "");

    foreach (ITextToken token in tokens)
    {
        string locale = token.Locale ?? string.Empty;
        if (token.Text == null || (!invalidCharactersOnly && locale != preferredLocale))
        {
            continue;
        }

        Dictionary<string, bool> validChars;
        if (!validCharsByLocale.TryGetValue(locale, out validChars))
        {
            validChars = StringToDictionary(GetValidCharacters(locale));
            validCharsByLocale.Add(locale, validChars);
        }

        int offset = 0;
        foreach (string key in ParseCharacterSequences(token.Text))
        {
            bool report;
            if (invalidCharactersOnly)
            {
                // Contains is used for the always-valid characters instead of IndexOf,
                // which caused false positives for certain characters
                // (e.g., U+0234 & U+1234).
                bool invalidItem = !m_alwaysValidCharacters.Contains(key) &&
                    !validChars.ContainsKey(key);
                report = (wantSpecificKey && desiredKey == key) || invalidItem;
            }
            else
            {
                // Either everything is wanted, or only the sequences equal to the key.
                report = !wantSpecificKey || desiredKey == key;
            }

            if (report)
            {
                m_characterSequences.Add(new TextTokenSubstring(token, offset, key.Length));
            }

            offset += key.Length;
        }
    }

    return m_characterSequences;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="QuotationMarkCategorizer"/> class from
/// the "QuotationMarkInfo", "DefaultWritingSystemName", "StylesInfo" and
/// "CollapseAdjacentQuotes" parameters of the data source.
/// </summary>
/// <param name="source">The checks data source the parameters are read from.</param>
/// ------------------------------------------------------------------------------------
internal QuotationMarkCategorizer(IChecksDataSource source)
{
    string quoteMarkInfo = source.GetParameterValue("QuotationMarkInfo");
    string writingSystemName = source.GetParameterValue("DefaultWritingSystemName");
    m_quoteMarks = QuotationMarksList.Load(quoteMarkInfo, writingSystemName);

    string stylesInfo = source.GetParameterValue("StylesInfo");
    m_styleInfo = StylePropsInfo.Load(stylesInfo);

    // NOTE(review): the flag becomes true when the parameter value is "No" - confirm
    // that this polarity is intended.
    string collapseSetting = source.GetParameterValue("CollapseAdjacentQuotes");
    CollapseAdjacentQuotes = (collapseSetting == "No");
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="QTokenProcessor"/> class.
/// </summary>
/// <param name="dataSource">The checks data source.</param>
/// <param name="charCategorizer">The character categorizer.</param>
/// <param name="qmCategorizer">The quotation mark categorizer; its Pattern is used to
/// build the regular expressions for quote and non-quote characters.</param>
/// <param name="desiredKey">The desired key (can be string.Empty).</param>
/// <param name="results">The list stored as this processor's result list.</param>
/// ------------------------------------------------------------------------------------
internal QTokenProcessor(IChecksDataSource dataSource,
    CharacterCategorizer charCategorizer, QuotationMarkCategorizer qmCategorizer,
    string desiredKey, List<TextTokenSubstring> results)
{
    m_chkDataSource = dataSource;
    m_charCategorizer = charCategorizer;
    m_qmCategorizer = qmCategorizer;
    m_desiredKey = desiredKey;
    m_results = results;

    string verboseSetting = m_chkDataSource.GetParameterValue("VerboseQuotes");
    m_verboseQuotes = (verboseSetting == "Yes");

    m_noCloserMsg = Localize("Unmatched opening mark: level {0}");
    m_noOpenerMsg = Localize("Unmatched closing mark: level {0}");

    m_regExQuotes = new Regex(qmCategorizer.Pattern);

    // Make sure closing brackets are escaped before the pattern is placed inside the
    // negated character class.
    string escapedPattern = qmCategorizer.Pattern.Replace("]", "\\]");
    m_regExNonQuotes = new Regex(string.Format("[^{0}|\\s]", escapedPattern));
}