public void WordAndPuncts_initialSpace()
{
    // Input starts and ends with a space; exactly one word/punctuation pair is expected.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts(" Dude ");

    // IEnumerator<T> is IDisposable: release it even when an assertion throws.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        Assert.IsTrue(wordCollection.MoveNext());
        // "Dude" begins at index 1 of the input and is followed by a single space.
        CheckWordAndPunct(wordCollection.Current, "Dude", " ", 1);

        // Nothing else should be produced.
        Assert.IsFalse(wordCollection.MoveNext());
    }
}
public void WordAndPuncts_initialSpace()
{
    // A leading and a trailing space surround the only word in the input.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts(" Dude ");

    // The enumerator implements IDisposable, so scope it with a using statement.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        // First (and only) item: "Dude" at index 1, followed by one space.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "Dude", " ", 1);

        // The sequence must end here.
        Assert.IsFalse(wordCollection.MoveNext());
    }
}
/// --------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="TextFileDataSource"/> class and wraps
/// every line of the file data in a <see cref="TextFileToken"/>.
/// </summary>
/// <param name="scrChecksDllFile">The DLL that contains the CharactersCheck class</param>
/// <param name="scrCheck">Name of the scripture check to use</param>
/// <param name="fileData">An array of strings with the lines of data from the file.</param>
/// <param name="scrRefFormatString">Format string used to format scripture references.</param>
/// <param name="parameters">Checking parameters to send the check.</param>
/// <param name="categorizer">The character categorizer; when null, a default
/// <see cref="CharacterCategorizer"/> is created.</param>
/// --------------------------------------------------------------------------------
public TextFileDataSource(string scrChecksDllFile, string scrCheck, string[] fileData,
    string scrRefFormatString, Dictionary<string, string> parameters,
    CharacterCategorizer categorizer)
{
    m_scrChecksDllFile = scrChecksDllFile;
    m_scrCheck = scrCheck;
    m_params = parameters;

    // Fall back to a default categorizer when the caller supplies none.
    m_characterCategorizer = categorizer ?? new CharacterCategorizer();

    // One token per line; the second constructor argument is the 1-based position
    // of the line within fileData.
    m_tftList = new List<ITextToken>();
    for (int lineIndex = 0; lineIndex < fileData.Length; lineIndex++)
    {
        m_tftList.Add(new TextFileToken(fileData[lineIndex], lineIndex + 1, scrRefFormatString));
    }
}
public void WordAndPuncts_numberInWord()
{
    // A digit inside a word ("test1") must not split the word.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts("This is test1.");

    // IEnumerator<T> is IDisposable: release it even when an assertion throws.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        // Each expectation is: word, the punctuation that follows it, and the
        // index of the word within the input string.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "This", " ", 0);

        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "is", " ", 5);

        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "test1", ".", 8);

        Assert.IsFalse(wordCollection.MoveNext());
    }
}
public void WordAndPuncts_numberInWord()
{
    // "test1" contains a digit and is expected to come back as a single word.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts("This is test1.");

    // The enumerator implements IDisposable, so scope it with a using statement.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        // "This" at index 0, followed by a space.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "This", " ", 0);

        // "is" at index 5, followed by a space.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "is", " ", 5);

        // "test1" at index 8, followed by the final period.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "test1", ".", 8);

        Assert.IsFalse(wordCollection.MoveNext());
    }
}
public void WordAndPuncts_initialSpaceFollowedByNumbers()
{
    // NOTE(review): despite the test name, the input "1 2 3" has no leading
    // space - confirm whether " 1 2 3" was intended.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts("1 2 3");

    // IEnumerator<T> is IDisposable: release it even when an assertion throws.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        // Every number is expected as its own word, at indexes 0, 2 and 4.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "1", " ", 0);

        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "2", " ", 2);

        // The last number has nothing after it, so its punctuation is empty.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "3", "", 4);

        Assert.IsFalse(wordCollection.MoveNext());
    }
}
/// --------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="TextFileDataSource"/> class, creating
/// one <see cref="TextFileToken"/> for each line of the supplied file data.
/// </summary>
/// <param name="scrChecksDllFile">The DLL that contains the CharactersCheck class</param>
/// <param name="scrCheck">Name of the scripture check to use</param>
/// <param name="fileData">An array of strings with the lines of data from the file.</param>
/// <param name="scrRefFormatString">Format string used to format scripture references.</param>
/// <param name="parameters">Checking parameters to send the check.</param>
/// <param name="categorizer">The character categorizer; a default one is created when
/// this is null.</param>
/// --------------------------------------------------------------------------------
public TextFileDataSource(string scrChecksDllFile, string scrCheck, string[] fileData,
    string scrRefFormatString, Dictionary<string, string> parameters,
    CharacterCategorizer categorizer)
{
    m_scrChecksDllFile = scrChecksDllFile;
    m_scrCheck = scrCheck;

    // Use the caller's categorizer when there is one, otherwise a default instance.
    if (categorizer == null)
    {
        m_characterCategorizer = new CharacterCategorizer();
    }
    else
    {
        m_characterCategorizer = categorizer;
    }

    m_params = parameters;
    m_tftList = new List<ITextToken>();

    // Tokens are numbered starting at 1, in the order the lines appear.
    int lineNumber = 0;
    foreach (string line in fileData)
    {
        lineNumber++;
        m_tftList.Add(new TextFileToken(line, lineNumber, scrRefFormatString));
    }
}
public void WordAndPuncts_simple()
{
    // Plain sentence: four words, each reported with the punctuation that follows
    // it and the index at which the word starts in the input.
    CharacterCategorizer categorizer = new CharacterCategorizer("", "", "");
    IEnumerable<WordAndPunct> pairs = categorizer.WordAndPuncts("This is my test.");

    using (IEnumerator<WordAndPunct> cursor = pairs.GetEnumerator())
    {
        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "This", " ", 0);

        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "is", " ", 5);

        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "my", " ", 8);

        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "test", ".", 11);

        // No fifth item.
        Assert.IsFalse(cursor.MoveNext());
    }
}
public void WordAndPuncts_simple()
{
    // Simple case: "This is my test." should yield four word/punctuation pairs.
    CharacterCategorizer categorizer = new CharacterCategorizer("", "", "");

    using (IEnumerator<WordAndPunct> cursor =
        categorizer.WordAndPuncts("This is my test.").GetEnumerator())
    {
        // "This" at index 0, followed by a space.
        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "This", " ", 0);

        // "is" at index 5, followed by a space.
        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "is", " ", 5);

        // "my" at index 8, followed by a space.
        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "my", " ", 8);

        // "test" at index 11, followed by the final period.
        Assert.IsTrue(cursor.MoveNext());
        CheckWordAndPunct(cursor.Current, "test", ".", 11);

        Assert.IsFalse(cursor.MoveNext());
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="CapitalizationProcessor"/> class.
/// </summary>
/// <param name="checksDataSource">The source of data for Scripture checking.</param>
/// <param name="allCapitalizedStyles">Dictionary keyed by the style name containing the
/// type of style (character/paragraph) and a value indicating why it should begin with
/// a capital.</param>
/// ------------------------------------------------------------------------------------
public CapitalizationProcessor(IChecksDataSource checksDataSource,
    Dictionary<string, StyleCapInfo> allCapitalizedStyles)
{
    m_checksDataSource = checksDataSource;
    m_categorizer = checksDataSource.CharacterCategorizer;

    // A parameter value may be null or empty (the sentence-final punctuation lookup
    // below already allows for that). Treat a missing abbreviation list like an
    // empty one instead of failing with a NullReferenceException on Split().
    string abbreviations = checksDataSource.GetParameterValue("Abbreviations") ?? string.Empty;
    m_abbreviations = abbreviations.Split();

    m_allCapitalizedStyles = allCapitalizedStyles;

    string sentenceFinalPunc = checksDataSource.GetParameterValue("SentenceFinalPunctuation");
    if (!string.IsNullOrEmpty(sentenceFinalPunc))
    {
        // Every character of the parameter is a valid sentence-final mark.
        foreach (char ch in sentenceFinalPunc)
        {
            m_validSentenceFinalPuncts.Add(ch);
        }
    }
    else
    {
        // No punctuation is set up for this writing system that contains sentence-final punctuation.
        // Define sentence-final punctuation with these characters as a fallback: '.', '?', and '!'
        m_validSentenceFinalPuncts.Add('.');
        m_validSentenceFinalPuncts.Add('?');
        m_validSentenceFinalPuncts.Add('!');
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Returns a TextTokenSubstring for every occurrence of the desired key.
/// </summary>
/// <param name="tokens">The tokens to scan for punctuation patterns.</param>
/// <param name="desiredKey">e.g., _[_ or empty string to look for all patterns</param>
/// <returns>The punctuation sequences collected by the body and note processors.</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
{
#if DEBUG
    // Debug builds only: copy the tokens into a list.
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
    if (AllTokens.Count == 0)
    {
        // Keep the compiler from complaining about assigning to a variable, but not using it.
    }
#endif
    m_characterCategorizer = m_checksDataSource.CharacterCategorizer;

    // Valid/invalid patterns come from the XML pattern list when one is supplied,
    // otherwise from the plain valid/invalid item parameters.
    string patternsXml = m_checksDataSource.GetParameterValue("PunctuationPatterns");
    bool noPatternsXml = (patternsXml == null || patternsXml.Trim().Length == 0);
    if (noPatternsXml)
    {
        ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
        InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);
    }
    else
    {
        m_validItemsList = new List<string>();
        m_invalidItemsList = new List<string>();
        string writingSystemName =
            m_checksDataSource.GetParameterValue("DefaultWritingSystemName");
        PuncPatternsList loadedPatterns = PuncPatternsList.Load(patternsXml, writingSystemName);
        foreach (PuncPattern loaded in loadedPatterns)
        {
            if (!loaded.Valid)
            {
                m_invalidItemsList.Add(loaded.Pattern);
            }
            else
            {
                m_validItemsList.Add(loaded.Pattern);
            }
        }
    }

    // Anything other than "Advanced" or "Intermediate" (including null) means Basic.
    string levelName = m_checksDataSource.GetParameterValue("PunctCheckLevel");
    CheckingLevel level;
    if (levelName == "Advanced")
    {
        level = CheckingLevel.Advanced;
    }
    else if (levelName == "Intermediate")
    {
        level = CheckingLevel.Intermediate;
    }
    else
    {
        level = CheckingLevel.Basic;
    }

    // Only the first character of the parameter is used as the whitespace representation.
    string whitespaceRep = m_checksDataSource.GetParameterValue("PunctWhitespaceChar");
    if (!String.IsNullOrEmpty(whitespaceRep))
    {
        s_whitespaceRep = whitespaceRep.Substring(0, 1);
    }

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    QuotationMarkCategorizer quotationCategorizer =
        new QuotationMarkCategorizer(m_checksDataSource);

    // create processing state machines, one for body text, one for notes
    ProcessPunctationTokens bodyProcessor = new ProcessPunctationTokens(
        m_characterCategorizer, quotationCategorizer, level);
    ProcessPunctationTokens noteProcessor = new ProcessPunctationTokens(
        m_characterCategorizer, quotationCategorizer, level);

    m_punctuationSequences = new List<TextTokenSubstring>();

    // Route each token to the note processor or the body processor.
    foreach (ITextToken tok in tokens)
    {
        // Tokens without text, or in a locale other than the preferred one, are ignored.
        if (tok.Text == null)
        {
            continue;
        }
        if ((tok.Locale ?? string.Empty) != preferredLocale)
        {
            continue;
        }

        if (tok.TextType == TextType.Note)
        {
            // if a new note is starting finalize any punctuation sequences from the previous note
            if (tok.IsNoteStart)
            {
                noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
            }
            noteProcessor.ProcessToken(tok, desiredKey, m_punctuationSequences);
        }
        else if (tok.TextType == TextType.Verse || tok.TextType == TextType.Other)
        {
            // body text: finalize any note that was in progress and continue with body text
            noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
            bodyProcessor.ProcessToken(tok, desiredKey, m_punctuationSequences);
        }
        else if (tok.IsParagraphStart)
        {
            // Any other token type that starts a paragraph closes the pending body sequence.
            bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
            bodyProcessor.TreatAsParagraphStart = true;
        }
    }

    // Flush whatever is still pending in either processor.
    noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
    bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);

    return m_punctuationSequences;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="ProcessPunctationTokens"/> class,
/// storing the supplied categorizers and checking level.
/// </summary>
/// <param name="categorizer">The categorizer.</param>
/// <param name="quotationCategorizer">The quotation categorizer.</param>
/// <param name="level">Indicator to determine how much to combine contiguous
/// punctuation sequences into patterns. Advanced = All contiguous punctuation and
/// whitespace characters form a single pattern; Intermediate = Contiguous punctuation
/// forms a single pattern (delimited by whitespace); Basic = Each punctuation character
/// stands alone. In all three modes, whitespace before and/or after a punctuation token
/// indicates whether it is word-initial, word-medial, word-final, or isolated</param>
/// ------------------------------------------------------------------------------------
public ProcessPunctationTokens(CharacterCategorizer categorizer,
    QuotationMarkCategorizer quotationCategorizer, CheckingLevel level)
{
    // Nothing is computed here; the arguments are only kept for later use.
    m_level = level;
    m_quotationCategorizer = quotationCategorizer;
    m_categorizer = categorizer;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Runs the Characters Scripture check and reports every invalid character sequence
/// through the supplied handler.
/// </summary>
/// <param name="toks">The Scripture tokens to check.</param>
/// <param name="record">Method to record the error.</param>
/// ------------------------------------------------------------------------------------
public void Check(IEnumerable<ITextToken> toks, RecordErrorHandler record)
{
    // This method is called in ScrChecksDataSource.cs - RunCheck(IScriptureCheck check)
    m_categorizer = m_checksDataSource.CharacterCategorizer;

    // Get parameters needed to run this check.
    GetParameters();

    // Find all invalid characters and place them in 'm_characterSequences'
    GetReferences(toks, string.Empty, true);

    foreach (TextTokenSubstring invalidSequence in m_characterSequences)
    {
        // A sequence longer than one character is reported as a character/diacritic
        // combination; a single character gets the shorter message.
        string messageId;
        if (invalidSequence.ToString().Length > 1)
        {
            messageId = "Invalid or unknown character diacritic combination";
        }
        else
        {
            messageId = "Invalid or unknown character";
        }

        invalidSequence.Message = m_checksDataSource.GetLocalizedString(messageId);
        record(new RecordErrorEventArgs(invalidSequence, CheckId));
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Collects character references from the tokens into m_characterSequences.
/// </summary>
/// <param name="tokens">The tokens whose text is scanned.</param>
/// <param name="desiredKey">A specific character sequence to look for, or an empty
/// string for no specific key.</param>
/// <param name="invalidCharactersOnly">When true, sequences that are neither always
/// valid nor valid for the token's locale are collected, and tokens of every locale are
/// scanned; when false, only tokens in the preferred locale are scanned.</param>
/// <returns>The list that was stored in m_characterSequences.</returns>
/// ------------------------------------------------------------------------------------
private List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey,
    bool invalidCharactersOnly)
{
    if (m_categorizer == null)
    {
        m_categorizer = m_checksDataSource.CharacterCategorizer;
    }

    m_characterSequences = new List<TextTokenSubstring>();

    // Valid-character dictionaries are built once per locale and reused.
    Dictionary<string, Dictionary<string, bool>> validCharsByLocale =
        new Dictionary<string, Dictionary<string, bool>>();

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    // Does not change per token or per key, so it is computed once.
    bool wantSpecificKey = (desiredKey != "");

    foreach (ITextToken tok in tokens)
    {
        string locale = tok.Locale ?? string.Empty;
        if (tok.Text == null)
        {
            continue;
        }
        if (!invalidCharactersOnly && locale != preferredLocale)
        {
            continue;
        }

        Dictionary<string, bool> validChars;
        if (!validCharsByLocale.TryGetValue(locale, out validChars))
        {
            validChars = StringToDictionary(GetValidCharacters(locale));
            validCharsByLocale.Add(locale, validChars);
        }

        // Running character offset of the current sequence within the token text.
        int offset = 0;
        foreach (string key in ParseCharacterSequences(tok.Text))
        {
            bool include;
            if (invalidCharactersOnly)
            {
                // REVIEW (BobbydV): IndexOf causes false positives for certain
                // characters (e.g., U+0234 & U+1234). I think Contains is easier to read
                // and should work for both TE and Paratext for the "AlwaysValidCharacters"
                // list. (TomB)
                bool invalidItem = !m_alwaysValidCharacters.Contains(key) &&
                    !validChars.ContainsKey(key);
                include = invalidItem || (wantSpecificKey && desiredKey == key);
            }
            else
            {
                // Inventory mode: everything, or only the key that was asked for.
                include = !wantSpecificKey || desiredKey == key;
            }

            if (include)
            {
                m_characterSequences.Add(new TextTokenSubstring(tok, offset, key.Length));
            }

            offset += key.Length;
        }
    }

    return m_characterSequences;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="ProcessMixedCapitalization"/> class,
/// reading the prefix and suffix lists from the data source parameters.
/// </summary>
/// <param name="checksDataSource">The data source that supplies the character
/// categorizer and the "UncapitalizedPrefixes", "CapitalizedSuffixes" and
/// "CapitalizedPrefixes" parameters (whitespace-separated lists).</param>
/// <param name="result">The list stored as the result list of this processor.</param>
/// ------------------------------------------------------------------------------------
public ProcessMixedCapitalization(IChecksDataSource checksDataSource,
    List<TextTokenSubstring> result)
{
    m_categorizer = checksDataSource.CharacterCategorizer;
    m_result = result;

    // A null parameter value (assumed possible for an unset parameter) is handled
    // like an empty string, so Split() cannot throw a NullReferenceException.
    m_uncapitalizedPrefixes = new List<string>(
        (checksDataSource.GetParameterValue("UncapitalizedPrefixes") ?? string.Empty).Split());
    m_capitalizedSuffixes = new List<string>(
        (checksDataSource.GetParameterValue("CapitalizedSuffixes") ?? string.Empty).Split());
    m_capitalizedPrefixes = new List<string>(
        (checksDataSource.GetParameterValue("CapitalizedPrefixes") ?? string.Empty).Split());
}
public void WordAndPuncts_initialSpaceFollowedByNumbers()
{
    // NOTE(review): the test name mentions an initial space, but the input
    // "1 2 3" does not start with one - confirm the intended input.
    CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
    IEnumerable<WordAndPunct> words = cat.WordAndPuncts("1 2 3");

    // The enumerator implements IDisposable, so scope it with a using statement.
    using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
    {
        // "1" at index 0, followed by a space.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "1", " ", 0);

        // "2" at index 2, followed by a space.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "2", " ", 2);

        // "3" at index 4, with no punctuation after it.
        Assert.IsTrue(wordCollection.MoveNext());
        CheckWordAndPunct(wordCollection.Current, "3", "", 4);

        Assert.IsFalse(wordCollection.MoveNext());
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="AWord"/> class. The prefix/suffix
/// search only runs when both the lower-case and the upper-case letter counts are
/// non-zero after the text has been scanned.
/// </summary>
/// <param name="text">The text of the word.</param>
/// <param name="categorizer">The character categorizer.</param>
/// ------------------------------------------------------------------------------------
public AWord(string text, CharacterCategorizer categorizer)
{
    m_text = text;
    m_categorizer = categorizer;

    // Presumably this call also fills in m_lowerCaseLetters / m_upperCaseLetters,
    // which are tested right after it - confirm against the helper.
    string wordFormingOnly = CountLettersAndReturnWordWithOnlyWordFormingCharacters(text);

    bool hasBothCases = (m_lowerCaseLetters != 0 && m_upperCaseLetters != 0);
    if (hasBothCases)
    {
        FindPrefixAndSuffixIfAny(wordFormingOnly);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets a list of TextTokenSubstrings containing the references and character offsets
/// where quotation problems occur.
/// </summary>
/// <param name="tokens">The tokens (from the data source) to check for quotation problems.</param>
/// <param name="desiredKey">empty string.</param>
/// <returns>The quotation problems collected by the body and note processors.</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
{
    m_charCategorizer = m_chkDataSource.CharacterCategorizer;
    ValidItems = m_chkDataSource.GetParameterValue(m_validItemsParameter);
    InvalidItems = m_chkDataSource.GetParameterValue(m_invalidItemsParameter);

    QuotationMarkCategorizer qmCategorizer = new QuotationMarkCategorizer(m_chkDataSource);
    m_qmProblems = new List<TextTokenSubstring>();

    // Body text and notes are tracked by separate processors that both write into
    // the same result list.
    QTokenProcessor bodyProcessor = new QTokenProcessor(m_chkDataSource,
        m_charCategorizer, qmCategorizer, desiredKey, m_qmProblems);
    QTokenProcessor noteProcessor = new QTokenProcessor(m_chkDataSource,
        m_charCategorizer, qmCategorizer, desiredKey, m_qmProblems);

    // A single VerseTextToken instance is reused; its Token is replaced for each body token.
    VerseTextToken scrToken = new VerseTextToken();

    foreach (ITextToken tok in tokens)
    {
        if (tok.TextType == TextType.Note)
        {
            // If a new note is starting finalize any sequences from the previous note.
            if (tok.IsNoteStart)
            {
                noteProcessor.FinalizeResult();
            }
            noteProcessor.ProcessToken(tok, null);
            continue;
        }

        bool isBodyToken = tok.TextType == TextType.Verse ||
            tok.TextType == TextType.Other || tok.IsParagraphStart;
        if (!isBodyToken)
        {
            continue;
        }

        scrToken.Token = tok;
        // body text: finalize any note that was in progress and continue with body text
        noteProcessor.FinalizeResult();
        bodyProcessor.ProcessToken(tok, scrToken);
    }

    // Flush whatever is still pending in either processor.
    noteProcessor.FinalizeResult();
    bodyProcessor.FinalizeResult();

    return m_qmProblems;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets a list of TextTokenSubstrings containing the references and character offsets
/// where repeated words occur.
/// </summary>
/// <param name="tokens">The tokens (from the data source) to check for repeated words.
/// </param>
/// <param name="desiredKey">If looking for occurrences of a specific repeated word,
/// set this to be that word; otherwise pass an empty string.</param>
/// <returns>The repeated words collected by the body and note processors.</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
{
#if DEBUG
    // Debug builds only: copy the tokens into a list.
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
    if (AllTokens.Count == 0)
    {
        // Keep the compiler from complaining about assigning to a variable, but not using it.
    }
#endif
    characterCategorizer = m_checksDataSource.CharacterCategorizer;

    // Get a string of words that may be validly repeated.
    // Words are separated by blanks.
    ValidItems = m_checksDataSource.GetParameterValue("RepeatableWords");

    // List of words that are known to be not repeatable.
    InvalidItems = m_checksDataSource.GetParameterValue("NonRepeatableWords");

    TextType previousType = TextType.Other;
    m_repeatedWords = new List<TextTokenSubstring>();

    // Separate processors for body text and notes, sharing one result list.
    ProcessRepeatedWords bodyProcessor =
        new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);
    ProcessRepeatedWords noteProcessor =
        new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);

    foreach (ITextToken tok in tokens)
    {
        // A new paragraph starts both processors over.
        if (tok.IsParagraphStart)
        {
            noteProcessor.Reset();
            bodyProcessor.Reset();
        }

        TextType currentType = tok.TextType;

        if (currentType == TextType.Note)
        {
            if (tok.IsNoteStart)
            {
                noteProcessor.Reset();
            }
            noteProcessor.ProcessToken(tok);
        }

        // When we leave a caption, we start over checking for repeated words.
        // A caption is a start of a paragraph, so we already start over
        // when we encounter a picture caption.
        if (previousType == TextType.PictureCaption)
        {
            noteProcessor.Reset();
        }

        if (currentType == TextType.Verse || currentType == TextType.Other)
        {
            // Body text interrupts any note that was being tracked.
            noteProcessor.Reset();
            bodyProcessor.ProcessToken(tok);
        }
        else if (currentType == TextType.ChapterNumber)
        {
            // A chapter number starts the body processor over.
            bodyProcessor.Reset();
        }

        previousType = currentType;
    }

    return m_repeatedWords;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Scans the tokens with a body processor and a note processor for matched pairs and
/// returns the substrings they collected in m_unmatchedPairs.
/// </summary>
/// <param name="tokens">The tokens to scan; tokens without text or in a locale other
/// than the preferred one are skipped.</param>
/// <param name="desiredKey">The key passed through to the token processors.</param>
/// <returns>The list stored in m_unmatchedPairs.</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
{
#if DEBUG
    // Debug builds only: copy the tokens into a list.
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
#endif
    m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
    ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
    InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    // Style lists used to build the style categorizer.
    string poeticStyles = m_checksDataSource.GetParameterValue("PoeticStyles");
    string introductionOutlineStyles =
        m_checksDataSource.GetParameterValue("IntroductionOutlineStyles");

    // The matched pair definitions are loaded together with the default writing system name.
    string matchedPairsValue = m_checksDataSource.GetParameterValue("MatchedPairs");
    string writingSystemName = m_checksDataSource.GetParameterValue("DefaultWritingSystemName");
    MatchedPairList pairList = MatchedPairList.Load(matchedPairsValue, writingSystemName);

    StyleCategorizer styleCategorizer =
        new StyleCategorizer(poeticStyles, introductionOutlineStyles);

    // Separate processors for body text and notes.
    ProcessMatchedPairTokens bodyProcessor = new ProcessMatchedPairTokens(
        m_checksDataSource, pairList, styleCategorizer);
    ProcessMatchedPairTokens noteProcessor = new ProcessMatchedPairTokens(
        m_checksDataSource, pairList, styleCategorizer);

    m_unmatchedPairs = new List<TextTokenSubstring>();

    foreach (ITextToken tok in tokens)
    {
        if (tok.Text == null)
        {
            continue;
        }
        if ((tok.Locale ?? string.Empty) != preferredLocale)
        {
            continue;
        }

        if (tok.TextType == TextType.Note)
        {
            // if a new note is starting finalize any sequences from the previous note
            if (tok.IsNoteStart)
            {
                noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
            }
            noteProcessor.ProcessToken(tok, desiredKey, m_unmatchedPairs);
        }
        else if (tok.TextType == TextType.Verse || tok.TextType == TextType.Other ||
            tok.IsParagraphStart)
        {
            // body text: finalize any note that was in progress and continue with body text
            noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
            bodyProcessor.ProcessToken(tok, desiredKey, m_unmatchedPairs);
        }
    }

    // Flush whatever is still pending in either processor.
    noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
    bodyProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);

    return m_unmatchedPairs;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="QTokenProcessor"/> class.
/// </summary>
/// <param name="dataSource">The checks data source.</param>
/// <param name="charCategorizer">The character categorizer.</param>
/// <param name="qmCategorizer">The quotation mark categorizer.</param>
/// <param name="desiredKey">The desired key (can be string.Empty).</param>
/// <param name="results">The result.</param>
/// ------------------------------------------------------------------------------------
internal QTokenProcessor(IChecksDataSource dataSource, CharacterCategorizer charCategorizer,
    QuotationMarkCategorizer qmCategorizer, string desiredKey, List<TextTokenSubstring> results)
{
    m_chkDataSource = dataSource;
    m_charCategorizer = charCategorizer;
    m_qmCategorizer = qmCategorizer;
    m_desiredKey = desiredKey;
    m_results = results;

    // Verbose quotes are on only when the parameter is exactly "Yes".
    string verboseSetting = m_chkDataSource.GetParameterValue("VerboseQuotes");
    m_verboseQuotes = (verboseSetting == "Yes");

    m_noCloserMsg = Localize("Unmatched opening mark: level {0}");
    m_noOpenerMsg = Localize("Unmatched closing mark: level {0}");

    // One expression built from the categorizer's pattern as-is, and one negated
    // character class built from the same pattern plus whitespace.
    string quotePattern = qmCategorizer.Pattern;
    m_regExQuotes = new Regex(quotePattern);

    // Make sure brackets are escaped
    string escapedPattern = quotePattern.Replace("]", "\\]");
    m_regExNonQuotes = new Regex(string.Format("[^{0}|\\s]", escapedPattern));
}
public ProcessRepeatedWords(CharacterCategorizer characterCategorizer,
    List<TextTokenSubstring> result, string desiredKey)
{
    // The desired key is stored lower-cased.
    string loweredKey = desiredKey.ToLower();

    this.desiredKey = loweredKey;
    this.result = result;
    this.characterCategorizer = characterCategorizer;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Get all instances of the item being checked in the token list passed.
/// This includes both valid and invalid instances.
/// This is used 1) to create an inventory of these items, to show the user all
/// instances of an item with a specified key; and 2) with a "desiredKey", to fetch
/// the instances of a specific item (e.g. all the places where "the" is a repeated
/// word).
/// </summary>
/// <param name="tokens">Tokens for text to be scanned</param>
/// <param name="desiredKey">If you only want instances of a specific key (e.g. one word,
/// one punctuation pattern, one character, etc.) place it here. Empty string returns
/// all items.</param>
/// <returns>List of token substrings</returns>
/// ------------------------------------------------------------------------------------
public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
{
#if DEBUG
    // Debug builds only: copy the tokens into a list.
    List<ITextToken> AllTokens = new List<ITextToken>(tokens);
#endif
    m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
    ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
    InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

    string preferredLocale =
        m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

    m_mixedCapitalization = new List<TextTokenSubstring>();
    ProcessMixedCapitalization processor =
        new ProcessMixedCapitalization(m_checksDataSource, m_mixedCapitalization);

    foreach (ITextToken tok in tokens)
    {
        // A token without text has no words to examine; skip it so that a null
        // string is never handed to WordAndPuncts. Tokens in a locale other than
        // the preferred one are skipped as well.
        if (tok.Text == null || (tok.Locale ?? string.Empty) != preferredLocale)
        {
            continue;
        }

        foreach (WordAndPunct wap in m_characterCategorizer.WordAndPuncts(tok.Text))
        {
            processor.ProcessWord(tok, wap, desiredKey);
        }
    }

    return m_mixedCapitalization;
}