Пример #1
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Gets all instances of the item being checked in the token list passed. This
        /// includes both valid and invalid instances. It is used in two ways:
        /// 1) with an empty "desiredKey", to create an inventory of these items; and
        /// 2) with a specific "desiredKey", to fetch only the instances of that one item so
        /// the user can be shown every place it occurs.
        /// Only tokens that have text and whose locale equals the "PreferredLocale" parameter
        /// are scanned (a missing locale or parameter is treated as the empty string).
        /// </summary>
        /// <param name="tokens">Tokens for text to be scanned</param>
        /// <param name="desiredKey">If you only want instances of a specific key (e.g. one word,
        /// one punctuation pattern, one character, etc.) place it here. Empty string returns
        /// all items.</param>
        /// <returns>List of token substrings (the same list that is stored in
        /// m_mixedCapitalization)</returns>
        /// ------------------------------------------------------------------------------------
        public List <TextTokenSubstring> GetReferences(IEnumerable <ITextToken> tokens, string desiredKey)
        {
#if DEBUG
            // Debug builds materialize the tokens up front (presumably so that they can be
            // inspected in the debugger); the list is not used otherwise.
            List <ITextToken> AllTokens = new List <ITextToken>(tokens);
            if (AllTokens.Count == 0)
            {
                // Keep the compiler from complaining about assigning to a variable, but not using it.
            }
#endif
            m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
            ValidItems             = m_checksDataSource.GetParameterValue(kValidItemsParameter);
            InvalidItems           = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

            string preferredLocale =
                m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

            m_mixedCapitalization = new List <TextTokenSubstring>();
            ProcessMixedCapitalization processor =
                new ProcessMixedCapitalization(m_checksDataSource, m_mixedCapitalization);

            foreach (ITextToken tok in tokens)
            {
                // Tokens without text have nothing to scan; skipping them also keeps a null
                // string from being handed to the categorizer.
                if (tok.Text == null || (tok.Locale ?? string.Empty) != preferredLocale)
                {
                    continue;
                }

                foreach (WordAndPunct wap in m_characterCategorizer.WordAndPuncts(tok.Text))
                {
                    processor.ProcessWord(tok, wap, desiredKey);
                }
            }

            return m_mixedCapitalization;
        }
Пример #2
0
 /// ------------------------------------------------------------------------------------
 /// <summary>
 /// Initializes a new instance of the <see cref="ProcessRepeatedWords"/> class.
 /// </summary>
 /// <param name="characterCategorizer">The character categorizer stored for later use
 /// by this processor.</param>
 /// <param name="result">The list stored as this processor's result list (presumably
 /// the list that found occurrences are added to - confirm against ProcessToken).</param>
 /// <param name="desiredKey">The key to look for; it is stored lower-cased. A null key
 /// is treated the same as an empty string.</param>
 /// ------------------------------------------------------------------------------------
 public ProcessRepeatedWords(CharacterCategorizer characterCategorizer,
                             List <TextTokenSubstring> result, string desiredKey)
 {
     this.characterCategorizer = characterCategorizer;
     this.result     = result;
     // Guard against a null key, which would otherwise throw a NullReferenceException.
     this.desiredKey = (desiredKey ?? string.Empty).ToLower();
 }
Пример #3
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Initializes a new instance of the <see cref="CapitalizationProcessor"/> class.
        /// </summary>
        /// <param name="checksDataSource">The source of data for Scripture checking.</param>
        /// <param name="allCapitalizedStyles">Dictionary keyed by the style name containing the
        /// type of style (character/paragraph) and a value indicating why it should begin with
        /// a capital.</param>
        /// ------------------------------------------------------------------------------------
        public CapitalizationProcessor(IChecksDataSource checksDataSource,
                                       Dictionary <string, StyleCapInfo> allCapitalizedStyles)
        {
            m_checksDataSource     = checksDataSource;
            m_categorizer          = checksDataSource.CharacterCategorizer;
            m_allCapitalizedStyles = allCapitalizedStyles;

            // A parameter value may be missing (the sentence-final punctuation below already
            // allows for null/empty), so fall back to an empty string instead of
            // dereferencing null. The abbreviations are split on whitespace.
            string abbreviations = checksDataSource.GetParameterValue("Abbreviations");
            m_abbreviations = (abbreviations ?? string.Empty).Split();

            string sentenceFinalPunc = checksDataSource.GetParameterValue("SentenceFinalPunctuation");

            if (!string.IsNullOrEmpty(sentenceFinalPunc))
            {
                // Every character of the parameter value is a sentence-final punctuation mark.
                foreach (char ch in sentenceFinalPunc)
                {
                    m_validSentenceFinalPuncts.Add(ch);
                }
            }
            else
            {
                // No sentence-final punctuation is set up for this writing system.
                // Define sentence-final punctuation with these characters as a fallback: '.', '?', and '!'
                m_validSentenceFinalPuncts.Add('.');
                m_validSentenceFinalPuncts.Add('?');
                m_validSentenceFinalPuncts.Add('!');
            }
        }
Пример #4
0
 /// ------------------------------------------------------------------------------------
 /// <summary>
 /// Initializes a new instance of the <see cref="ProcessPunctationTokens"/> class by
 /// storing the supplied categorizers and checking level.
 /// </summary>
 /// <param name="categorizer">The character categorizer.</param>
 /// <param name="quotationCategorizer">The quotation mark categorizer.</param>
 /// <param name="level">Indicator to determine how much to combine contiguous
 /// punctuation sequences into patterns. Advanced = all contiguous punctuation and
 /// whitespace characters form a single pattern; Intermediate = contiguous punctuation
 /// forms a single pattern (delimited by whitespace); Basic = each punctuation character
 /// stands alone. In all three modes, whitespace before and/or after a punctuation token
 /// indicates whether it is word-initial, word-medial, word-final, or isolated.</param>
 /// ------------------------------------------------------------------------------------
 public ProcessPunctationTokens(CharacterCategorizer categorizer,
                                QuotationMarkCategorizer quotationCategorizer, CheckingLevel level)
 {
     this.m_level                = level;
     this.m_quotationCategorizer = quotationCategorizer;
     this.m_categorizer          = categorizer;
 }
Пример #5
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Gets a list of TextTokenSubstrings containing the references and character offsets
        /// where repeated words occur. Note text and body text (verse/other) are tracked by
        /// two separate processors that add to the same result list.
        /// </summary>
        /// <param name="tokens">The tokens (from the data source) to check for repeated words.
        /// </param>
        /// <param name="desiredKey">If looking for occurrences of a specific repeated word,
        /// set this to be that word; otherwise pass an empty string.</param>
        /// <returns>The list of repeated-word occurrences (the same list that is stored in
        /// m_repeatedWords).</returns>
        /// ------------------------------------------------------------------------------------
        public List <TextTokenSubstring> GetReferences(IEnumerable <ITextToken> tokens, string desiredKey)
        {
#if DEBUG
            List <ITextToken> AllTokens = new List <ITextToken>(tokens);
#endif
            characterCategorizer = m_checksDataSource.CharacterCategorizer;

            // Blank-separated words that may validly be repeated.
            ValidItems = m_checksDataSource.GetParameterValue("RepeatableWords");

            // Words that are known not to be repeatable.
            InvalidItems = m_checksDataSource.GetParameterValue("NonRepeatableWords");

            m_repeatedWords = new List <TextTokenSubstring>();

            ProcessRepeatedWords bodyProcessor =
                new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);
            ProcessRepeatedWords noteProcessor =
                new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);

            TextType previousType = TextType.Other;

            foreach (ITextToken token in tokens)
            {
                // A new paragraph starts both processors over.
                if (token.IsParagraphStart)
                {
                    noteProcessor.Reset();
                    bodyProcessor.Reset();
                }

                TextType currentType = token.TextType;

                if (currentType == TextType.Note)
                {
                    // Each note is checked on its own.
                    if (token.IsNoteStart)
                    {
                        noteProcessor.Reset();
                    }
                    noteProcessor.ProcessToken(token);
                }

                // When we leave a caption, we start over checking for repeated words.
                // A caption is a start of a paragraph, so we already start over
                // when we encounter a picture caption.
                // NOTE(review): only the note processor is reset here, not the body
                // processor - confirm that this is the intended one.
                if (previousType == TextType.PictureCaption)
                {
                    noteProcessor.Reset();
                }

                if (currentType == TextType.Verse || currentType == TextType.Other)
                {
                    // Body text ends any note that was in progress.
                    noteProcessor.Reset();
                    bodyProcessor.ProcessToken(token);
                }
                else if (currentType == TextType.ChapterNumber)
                {
                    bodyProcessor.Reset();
                }

                previousType = currentType;
            }

            return m_repeatedWords;
        }
Пример #6
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Initializes a new instance of the <see cref="AWord"/> class. The text and
        /// categorizer are stored, the letters of the text are counted, and - only when
        /// both the lower-case and the upper-case letter counts are non-zero - the word is
        /// searched for a prefix and suffix.
        /// </summary>
        /// <param name="text">The text of the word.</param>
        /// <param name="categorizer">The character categorizer stored for this word.</param>
        /// ------------------------------------------------------------------------------------
        public AWord(string text, CharacterCategorizer categorizer)
        {
            m_text        = text;
            m_categorizer = categorizer;

            // The letter counts tested below are presumably updated by this call (see its
            // name); it returns the word reduced to its word-forming characters.
            string wordFormingOnly = CountLettersAndReturnWordWithOnlyWordFormingCharacters(text);

            bool hasLowerCase = m_lowerCaseLetters != 0;
            bool hasUpperCase = m_upperCaseLetters != 0;

            // Only a word that mixes both cases needs the prefix/suffix analysis.
            if (hasLowerCase && hasUpperCase)
            {
                FindPrefixAndSuffixIfAny(wordFormingOnly);
            }
        }
Пример #7
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Initializes a new instance of the <see cref="ProcessMixedCapitalization"/> class.
        /// The lists of uncapitalized prefixes, capitalized suffixes and capitalized prefixes
        /// are read from the data source parameters of the same names and split on
        /// whitespace. A parameter that has no value is treated as an empty string.
        /// </summary>
        /// <param name="checksDataSource">The source of data for the check; supplies the
        /// character categorizer and the prefix/suffix parameters.</param>
        /// <param name="result">The list stored as this processor's result list.</param>
        /// ------------------------------------------------------------------------------------
        public ProcessMixedCapitalization(IChecksDataSource checksDataSource,
                                          List <TextTokenSubstring> result)
        {
            m_categorizer = checksDataSource.CharacterCategorizer;
            m_result      = result;

            // Each value is null-guarded so that a missing parameter behaves like an empty
            // one instead of throwing a NullReferenceException on Split().
            string uncapitalizedPrefixes = checksDataSource.GetParameterValue("UncapitalizedPrefixes");
            m_uncapitalizedPrefixes = new List <string>(
                (uncapitalizedPrefixes ?? string.Empty).Split());

            string capitalizedSuffixes = checksDataSource.GetParameterValue("CapitalizedSuffixes");
            m_capitalizedSuffixes = new List <string>(
                (capitalizedSuffixes ?? string.Empty).Split());

            string capitalizedPrefixes = checksDataSource.GetParameterValue("CapitalizedPrefixes");
            m_capitalizedPrefixes = new List <string>(
                (capitalizedPrefixes ?? string.Empty).Split());
        }
Пример #8
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Runs the Characters Scripture check: collects the invalid character sequences in
        /// the given tokens, attaches a localized message to each one and reports it through
        /// the supplied handler.
        /// </summary>
        /// <param name="toks">The Scripture tokens to check.</param>
        /// <param name="record">Method to record the error.</param>
        /// ------------------------------------------------------------------------------------
        public void Check(IEnumerable <ITextToken> toks, RecordErrorHandler record)
        {
            // This method is called in ScrChecksDataSource.cs - RunCheck(IScriptureCheck check)
            m_categorizer = m_checksDataSource.CharacterCategorizer;

            // Load the parameters this check depends on.
            GetParameters();

            // Collect only the invalid characters; they end up in 'm_characterSequences'.
            GetReferences(toks, string.Empty, true);

            foreach (TextTokenSubstring invalidSequence in m_characterSequences)
            {
                // A sequence longer than one character gets the "diacritic combination"
                // message; a single character gets the plain one.
                if (invalidSequence.ToString().Length > 1)
                {
                    invalidSequence.Message =
                        m_checksDataSource.GetLocalizedString("Invalid or unknown character diacritic combination");
                }
                else
                {
                    invalidSequence.Message =
                        m_checksDataSource.GetLocalizedString("Invalid or unknown character");
                }

                record(new RecordErrorEventArgs(invalidSequence, CheckId));
            }
        }
Пример #9
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Returns a TextTokenSubstring for every occurrence of the desiredKey. Note tokens
        /// and body tokens (verse/other) are fed to two separate processors; only tokens
        /// that have text and whose locale equals the "PreferredLocale" parameter are
        /// scanned.
        /// </summary>
        /// <param name="tokens">The tokens to scan.</param>
        /// <param name="desiredKey">e.g., _[_ or empty string to look for all patterns</param>
        /// <returns>The list of punctuation sequences found (the same list that is stored in
        /// m_punctuationSequences).</returns>
        /// ------------------------------------------------------------------------------------
        public List <TextTokenSubstring> GetReferences(IEnumerable <ITextToken> tokens, string desiredKey)
        {
#if DEBUG
            List <ITextToken> AllTokens = new List <ITextToken>(tokens);
            if (AllTokens.Count == 0)
            {
                // Keep the compiler from complaining about assigning to a variable, but not using it.
            }
#endif
            m_characterCategorizer = m_checksDataSource.CharacterCategorizer;

            // Valid/invalid patterns come from the "PunctuationPatterns" parameter when it
            // has content; otherwise from the plain valid/invalid item parameters.
            string patternsXml = m_checksDataSource.GetParameterValue("PunctuationPatterns");
            if (patternsXml == null || patternsXml.Trim().Length == 0)
            {
                ValidItems   = m_checksDataSource.GetParameterValue(kValidItemsParameter);
                InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);
            }
            else
            {
                m_validItemsList   = new List <string>();
                m_invalidItemsList = new List <string>();

                string writingSystemName =
                    m_checksDataSource.GetParameterValue("DefaultWritingSystemName");
                PuncPatternsList loadedPatterns = PuncPatternsList.Load(patternsXml, writingSystemName);

                foreach (PuncPattern loaded in loadedPatterns)
                {
                    if (!loaded.Valid)
                    {
                        m_invalidItemsList.Add(loaded.Pattern);
                    }
                    else
                    {
                        m_validItemsList.Add(loaded.Pattern);
                    }
                }
            }

            // Anything other than "Advanced" or "Intermediate" (including no value at all)
            // means the basic level.
            string        levelName = m_checksDataSource.GetParameterValue("PunctCheckLevel");
            CheckingLevel level     = CheckingLevel.Basic;
            if (levelName == "Advanced")
            {
                level = CheckingLevel.Advanced;
            }
            else if (levelName == "Intermediate")
            {
                level = CheckingLevel.Intermediate;
            }

            // Only the first character of the whitespace representation parameter is used.
            string whitespaceSetting = m_checksDataSource.GetParameterValue("PunctWhitespaceChar");
            if (!String.IsNullOrEmpty(whitespaceSetting))
            {
                s_whitespaceRep = whitespaceSetting.Substring(0, 1);
            }

            string preferredLocale =
                m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

            QuotationMarkCategorizer quotationCategorizer =
                new QuotationMarkCategorizer(m_checksDataSource);

            // Two processing state machines: one for body text, one for notes.
            ProcessPunctationTokens bodyProcessor = new ProcessPunctationTokens(
                m_characterCategorizer, quotationCategorizer, level);
            ProcessPunctationTokens noteProcessor = new ProcessPunctationTokens(
                m_characterCategorizer, quotationCategorizer, level);

            m_punctuationSequences = new List <TextTokenSubstring>();

            // Route each usable token to the processor that matches its text type.
            foreach (ITextToken token in tokens)
            {
                bool usable = token.Text != null &&
                              (token.Locale ?? string.Empty) == preferredLocale;
                if (!usable)
                {
                    continue;
                }

                TextType type = token.TextType;

                if (type == TextType.Note)
                {
                    // A new note finalizes whatever was pending from the previous note.
                    if (token.IsNoteStart)
                    {
                        noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
                    }
                    noteProcessor.ProcessToken(token, desiredKey, m_punctuationSequences);
                }
                else if (type == TextType.Verse || type == TextType.Other)
                {
                    // Body text: finalize any note in progress, then continue with the body.
                    noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
                    bodyProcessor.ProcessToken(token, desiredKey, m_punctuationSequences);
                }
                else if (token.IsParagraphStart)
                {
                    // A paragraph start of any other text type closes the pending body
                    // sequence and marks the body processor as being at a paragraph start.
                    bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
                    bodyProcessor.TreatAsParagraphStart = true;
                }
            }

            // Flush whatever both processors still hold.
            noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
            bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);

            return m_punctuationSequences;
        }
Пример #10
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Scans the given tokens with the matched-pair processors and returns the resulting
        /// list of token substrings. Note tokens and body tokens are handled by two separate
        /// processors; only tokens that have text and whose locale equals the
        /// "PreferredLocale" parameter are scanned.
        /// </summary>
        /// <param name="tokens">The tokens to scan.</param>
        /// <param name="desiredKey">The key handed on to the processors (presumably an empty
        /// string requests all items, as in the other checks - confirm).</param>
        /// <returns>The list that the processors filled (the same list that is stored in
        /// m_unmatchedPairs).</returns>
        /// ------------------------------------------------------------------------------------
        public List <TextTokenSubstring> GetReferences(IEnumerable <ITextToken> tokens, string desiredKey)
        {
#if DEBUG
            List <ITextToken> AllTokens = new List <ITextToken>(tokens);
#endif
            m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
            ValidItems             = m_checksDataSource.GetParameterValue(kValidItemsParameter);
            InvalidItems           = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

            string preferredLocale =
                m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;
            string poeticStyles = m_checksDataSource.GetParameterValue("PoeticStyles");
            string introductionOutlineStyles =
                m_checksDataSource.GetParameterValue("IntroductionOutlineStyles");

            // Load the pair definitions for the default writing system.
            string pairsDefinition   = m_checksDataSource.GetParameterValue("MatchedPairs");
            string writingSystemName = m_checksDataSource.GetParameterValue("DefaultWritingSystemName");
            MatchedPairList pairList = MatchedPairList.Load(pairsDefinition, writingSystemName);

            StyleCategorizer styleCategorizer =
                new StyleCategorizer(poeticStyles, introductionOutlineStyles);

            // One processor for body text and one for notes, sharing the same configuration.
            ProcessMatchedPairTokens bodyProcessor = new ProcessMatchedPairTokens(
                m_checksDataSource, pairList, styleCategorizer);
            ProcessMatchedPairTokens noteProcessor = new ProcessMatchedPairTokens(
                m_checksDataSource, pairList, styleCategorizer);

            m_unmatchedPairs = new List <TextTokenSubstring>();

            foreach (ITextToken token in tokens)
            {
                bool usable = token.Text != null &&
                              (token.Locale ?? string.Empty) == preferredLocale;
                if (!usable)
                {
                    continue;
                }

                TextType type = token.TextType;

                if (type == TextType.Note)
                {
                    // A new note finalizes whatever was pending from the previous note.
                    if (token.IsNoteStart)
                    {
                        noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
                    }
                    noteProcessor.ProcessToken(token, desiredKey, m_unmatchedPairs);
                    continue;
                }

                bool isBodyToken = type == TextType.Verse ||
                                   type == TextType.Other ||
                                   token.IsParagraphStart;
                if (isBodyToken)
                {
                    // Body text: finalize any note in progress, then continue with the body.
                    noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
                    bodyProcessor.ProcessToken(token, desiredKey, m_unmatchedPairs);
                }
            }

            // Flush whatever both processors still hold.
            noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
            bodyProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);

            return m_unmatchedPairs;
        }
Пример #11
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Gets character references from the given tokens. Depending on the arguments this
        /// is either an inventory of character sequences or only the invalid ones.
        /// </summary>
        /// <param name="tokens">The tokens to scan; tokens without text are skipped.</param>
        /// <param name="desiredKey">A specific character sequence to collect, or an empty
        /// string when no specific sequence is wanted.</param>
        /// <param name="invalidCharactersOnly">True to scan tokens of every locale and
        /// collect the sequences that are in neither the always-valid list nor the valid
        /// characters of the token's locale (plus any sequence equal to a non-empty
        /// desiredKey). False to scan only tokens of the preferred locale and collect every
        /// sequence, or only those equal to desiredKey when it is not empty.</param>
        /// <returns>The list of references found (the same list that is stored in
        /// m_characterSequences).</returns>
        /// ------------------------------------------------------------------------------------
        private List <TextTokenSubstring> GetReferences(IEnumerable <ITextToken> tokens, string desiredKey,
                                                        bool invalidCharactersOnly)
        {
            if (m_categorizer == null)
            {
                m_categorizer = m_checksDataSource.CharacterCategorizer;
            }

            m_characterSequences = new List <TextTokenSubstring>();

            // Valid characters are looked up once per locale and cached here.
            Dictionary <string, Dictionary <string, bool> > validCharsByLocale =
                new Dictionary <string, Dictionary <string, bool> >();
            string preferredLocale = m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;
            bool   wantSpecificKey = (desiredKey != "");

            foreach (ITextToken token in tokens)
            {
                string locale = token.Locale ?? string.Empty;

                if (token.Text == null)
                {
                    continue;
                }

                // An inventory only looks at the preferred locale; the invalid-character
                // scan looks at every locale.
                if (!invalidCharactersOnly && locale != preferredLocale)
                {
                    continue;
                }

                Dictionary <string, bool> validChars;
                if (!validCharsByLocale.TryGetValue(locale, out validChars))
                {
                    validChars = StringToDictionary(GetValidCharacters(locale));
                    validCharsByLocale.Add(locale, validChars);
                }

                // Running character offset of the current sequence within the token text.
                int position = 0;

                foreach (string sequence in ParseCharacterSequences(token.Text))
                {
                    bool include;

                    if (invalidCharactersOnly)
                    {
                        // REVIEW (BobbydV): IndexOf causes false positives for certain
                        // characters (e.g., U+0234 & U+1234). I think Contains is easier to read
                        // and should work for both TE and Paratext for the "AlwaysValidCharacters"
                        // list. (TomB)
                        bool isInvalid = !m_alwaysValidCharacters.Contains(sequence) &&
                                         !validChars.ContainsKey(sequence);

                        include = isInvalid || (wantSpecificKey && desiredKey == sequence);
                    }
                    else
                    {
                        include = !wantSpecificKey || desiredKey == sequence;
                    }

                    if (include)
                    {
                        m_characterSequences.Add(
                            new TextTokenSubstring(token, position, sequence.Length));
                    }

                    position += sequence.Length;
                }
            }

            return m_characterSequences;
        }