Exemple #1
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Checks WordAndPuncts on text that starts and ends with a space (" Dude "): exactly
		/// one item is expected, with word "Dude", punctuation " " and a final check value
		/// of 1 (presumably the word's character offset).
		/// </summary>
		/// ------------------------------------------------------------------------------------
		public void WordAndPuncts_initialSpace()
		{
			CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
			IEnumerable<WordAndPunct> words = cat.WordAndPuncts(" Dude ");
			// IEnumerator<T> is IDisposable, so release it deterministically.
			using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
			{
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "Dude", " ", 1);
				Assert.IsFalse(wordCollection.MoveNext());
			}
		}
Exemple #2
0
        /// <summary>
        /// Checks WordAndPuncts on text that starts and ends with a space (" Dude "): exactly
        /// one item is expected, with word "Dude", punctuation " " and a final check value
        /// of 1 (presumably the word's character offset).
        /// </summary>
        public void WordAndPuncts_initialSpace()
        {
            CharacterCategorizer       cat   = new CharacterCategorizer("", "", "", "", "");
            IEnumerable <WordAndPunct> words = cat.WordAndPuncts(" Dude ");

            // IEnumerator<T> is IDisposable, so release it deterministically.
            using (IEnumerator <WordAndPunct> wordCollection = words.GetEnumerator())
            {
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "Dude", " ", 1);
                Assert.IsFalse(wordCollection.MoveNext());
            }
        }
		/// --------------------------------------------------------------------------------
		/// <summary>
		/// Creates a <see cref="TextFileDataSource"/> and builds one text token per line of
		/// file data, numbering the lines from 1.
		/// </summary>
		/// <param name="scrChecksDllFile">The DLL that contains the CharactersCheck class</param>
		/// <param name="scrCheck">Name of the scripture check to use</param>
		/// <param name="fileData">The lines of data read from the file, one string per line.</param>
		/// <param name="scrRefFormatString">Format string used to format scripture references;
		/// handed to every token.</param>
		/// <param name="parameters">Checking parameters to send the check.</param>
		/// <param name="categorizer">The character categorizer, or <c>null</c> to use a
		/// default-constructed one.</param>
		/// --------------------------------------------------------------------------------
		public TextFileDataSource(string scrChecksDllFile, string scrCheck, string[] fileData,
			string scrRefFormatString, Dictionary<string, string> parameters,
			CharacterCategorizer categorizer)
		{
			m_scrChecksDllFile = scrChecksDllFile;
			m_scrCheck = scrCheck;
			m_params = parameters;

			// Fall back to a default categorizer when the caller does not supply one.
			if (categorizer == null)
				categorizer = new CharacterCategorizer();
			m_characterCategorizer = categorizer;

			// Tokens receive 1-based line numbers.
			m_tftList = new List<ITextToken>();
			for (int lineIndex = 0; lineIndex < fileData.Length; lineIndex++)
			{
				m_tftList.Add(
					new TextFileToken(fileData[lineIndex], lineIndex + 1, scrRefFormatString));
			}
		}
Exemple #4
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Checks WordAndPuncts on "This is test1.": a digit inside a word must stay part of
		/// that word, so three items are expected ("This", "is", "test1").
		/// </summary>
		/// ------------------------------------------------------------------------------------
		public void WordAndPuncts_numberInWord()
		{
			CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
			IEnumerable<WordAndPunct> words = cat.WordAndPuncts("This is test1.");
			// IEnumerator<T> is IDisposable, so release it deterministically.
			using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
			{
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "This", " ", 0);
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "is", " ", 5);
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "test1", ".", 8);
				Assert.IsFalse(wordCollection.MoveNext());
			}
		}
Exemple #5
0
        /// <summary>
        /// Checks WordAndPuncts on "This is test1.": a digit inside a word must stay part of
        /// that word, so three items are expected ("This", "is", "test1").
        /// </summary>
        public void WordAndPuncts_numberInWord()
        {
            CharacterCategorizer       cat   = new CharacterCategorizer("", "", "", "", "");
            IEnumerable <WordAndPunct> words = cat.WordAndPuncts("This is test1.");

            // IEnumerator<T> is IDisposable, so release it deterministically.
            using (IEnumerator <WordAndPunct> wordCollection = words.GetEnumerator())
            {
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "This", " ", 0);
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "is", " ", 5);
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "test1", ".", 8);
                Assert.IsFalse(wordCollection.MoveNext());
            }
        }
Exemple #6
0
        /// <summary>
        /// Checks WordAndPuncts on "1 2 3": each number is expected as its own item, the
        /// last one with empty punctuation.
        /// NOTE(review): despite the method name, the input has no leading space - confirm
        /// whether " 1 2 3" was intended.
        /// </summary>
        public void WordAndPuncts_initialSpaceFollowedByNumbers()
        {
            CharacterCategorizer       cat   = new CharacterCategorizer("", "", "", "", "");
            IEnumerable <WordAndPunct> words = cat.WordAndPuncts("1 2 3");

            // IEnumerator<T> is IDisposable, so release it deterministically.
            using (IEnumerator <WordAndPunct> wordCollection = words.GetEnumerator())
            {
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "1", " ", 0);
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "2", " ", 2);
                Assert.IsTrue(wordCollection.MoveNext());
                CheckWordAndPunct(wordCollection.Current, "3", "", 4);
                Assert.IsFalse(wordCollection.MoveNext());
            }
        }
        /// --------------------------------------------------------------------------------
        /// <summary>
        /// Creates a <see cref="TextFileDataSource"/> and builds one text token per line of
        /// file data, numbering the lines from 1.
        /// </summary>
        /// <param name="scrChecksDllFile">The DLL that contains the CharactersCheck class</param>
        /// <param name="scrCheck">Name of the scripture check to use</param>
        /// <param name="fileData">The lines of data read from the file, one string per line.</param>
        /// <param name="scrRefFormatString">Format string used to format scripture references;
        /// handed to every token.</param>
        /// <param name="parameters">Checking parameters to send the check.</param>
        /// <param name="categorizer">The character categorizer, or <c>null</c> to use a
        /// default-constructed one.</param>
        /// --------------------------------------------------------------------------------
        public TextFileDataSource(string scrChecksDllFile, string scrCheck, string[] fileData,
                                  string scrRefFormatString, Dictionary <string, string> parameters,
                                  CharacterCategorizer categorizer)
        {
            m_scrChecksDllFile = scrChecksDllFile;
            m_scrCheck         = scrCheck;
            m_params           = parameters;

            // A null categorizer means "use the default one".
            m_characterCategorizer = categorizer ?? new CharacterCategorizer();

            // Each line becomes a token carrying its 1-based line number.
            m_tftList = new List <ITextToken>();
            int lineNumber = 0;

            foreach (string line in fileData)
            {
                lineNumber++;
                m_tftList.Add(new TextFileToken(line, lineNumber, scrRefFormatString));
            }
        }
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Checks WordAndPuncts on the plain sentence "This is my test.": four items are
		/// expected, in order, and nothing after them.
		/// </summary>
		/// ------------------------------------------------------------------------------------
		public void WordAndPuncts_simple()
		{
			// Expected items, in the order the enumerator must produce them.
			string[] expectedWords = new string[] { "This", "is", "my", "test" };
			string[] expectedPuncts = new string[] { " ", " ", " ", "." };
			int[] expectedValues = new int[] { 0, 5, 8, 11 };

			CharacterCategorizer cat = new CharacterCategorizer("", "", "");
			IEnumerable<WordAndPunct> words = cat.WordAndPuncts("This is my test.");
			using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
			{
				for (int i = 0; i < expectedWords.Length; i++)
				{
					Assert.IsTrue(wordCollection.MoveNext());
					CheckWordAndPunct(wordCollection.Current, expectedWords[i],
						expectedPuncts[i], expectedValues[i]);
				}

				// The sequence must end after the last expected item.
				Assert.IsFalse(wordCollection.MoveNext());
			}
		}
        /// <summary>
        /// Checks WordAndPuncts on the plain sentence "This is my test.": four items are
        /// expected, in order, and nothing after them.
        /// </summary>
        public void WordAndPuncts_simple()
        {
            // Expected items, in the order the enumerator must produce them.
            string[] wordsWanted  = { "This", "is", "my", "test" };
            string[] punctsWanted = { " ", " ", " ", "." };
            int[]    valuesWanted = { 0, 5, 8, 11 };

            CharacterCategorizer       cat   = new CharacterCategorizer("", "", "");
            IEnumerable <WordAndPunct> words = cat.WordAndPuncts("This is my test.");

            using (IEnumerator <WordAndPunct> wordCollection = words.GetEnumerator())
            {
                int next = 0;

                while (next < wordsWanted.Length)
                {
                    Assert.IsTrue(wordCollection.MoveNext());
                    CheckWordAndPunct(wordCollection.Current, wordsWanted[next],
                                      punctsWanted[next], valuesWanted[next]);
                    next++;
                }

                // The sequence must end after the last expected item.
                Assert.IsFalse(wordCollection.MoveNext());
            }
        }
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the <see cref="CapitalizationProcessor"/> class.
		/// Reads the "Abbreviations" and "SentenceFinalPunctuation" parameters from the data
		/// source.
		/// </summary>
		/// <param name="checksDataSource">The source of data for Scripture checking.</param>
		/// <param name="allCapitalizedStyles">Dictionary keyed by the style name containing the
		/// type of style (character/paragraph) and a value indicating why it should begin with
		/// a capital.</param>
		/// ------------------------------------------------------------------------------------
		public CapitalizationProcessor(IChecksDataSource checksDataSource,
			Dictionary<string, StyleCapInfo> allCapitalizedStyles)
		{
			m_checksDataSource = checksDataSource;
			m_categorizer = checksDataSource.CharacterCategorizer;
			m_allCapitalizedStyles = allCapitalizedStyles;

			// Treat a missing "Abbreviations" parameter like an empty one instead of
			// throwing a NullReferenceException; Split() separates on whitespace.
			string abbreviations =
				checksDataSource.GetParameterValue("Abbreviations") ?? string.Empty;
			m_abbreviations = abbreviations.Split();

			string sentenceFinalPunc = checksDataSource.GetParameterValue("SentenceFinalPunctuation");
			if (!string.IsNullOrEmpty(sentenceFinalPunc))
			{
				// Every character of the parameter is a valid sentence-final punctuation mark.
				foreach (char ch in sentenceFinalPunc)
					m_validSentenceFinalPuncts.Add(ch);
			}
			else
			{
				// No punctuation is set up for this writing system that contains sentence-final punctuation.
				// Define sentence-final punctuation with these characters as a fallback: '.', '?', and '!'
				m_validSentenceFinalPuncts.Add('.');
				m_validSentenceFinalPuncts.Add('?');
				m_validSentenceFinalPuncts.Add('!');
			}
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Returns a TextTokenSubstring for all occurrences of the desiredKey. Body text and
		/// notes are run through separate processors, so a note does not interrupt the
		/// punctuation sequence of the surrounding body text.
		/// </summary>
		/// <param name="tokens">The tokens to scan. Tokens with null text or a locale other
		/// than the "PreferredLocale" parameter are skipped.</param>
		/// <param name="desiredKey">e.g., _[_ or empty string to look for all patterns</param>
		/// <returns>The punctuation sequences found (also stored in m_punctuationSequences).
		/// </returns>
		/// ------------------------------------------------------------------------------------
		public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
		{
#if DEBUG
			// Debug builds copy the tokens into a list; note that this enumerates 'tokens'
			// one extra time.
			List<ITextToken> AllTokens = new List<ITextToken>(tokens);
			if (AllTokens.Count == 0)
			{
				// Keep the compiler from complaining about assigning to a variable, but not using it.
			}
#endif
			m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
			// Prefer the XML "PunctuationPatterns" parameter; when it is missing or blank fall
			// back to the plain valid/invalid item parameters.
			string sXmlMatchedPairs = m_checksDataSource.GetParameterValue("PunctuationPatterns");
			if (sXmlMatchedPairs != null && sXmlMatchedPairs.Trim().Length > 0)
			{
				m_validItemsList = new List<string>();
				m_invalidItemsList = new List<string>();
				PuncPatternsList puncPatternsList = PuncPatternsList.Load(sXmlMatchedPairs,
					m_checksDataSource.GetParameterValue("DefaultWritingSystemName"));
				// Sort each loaded pattern into the valid or the invalid list.
				foreach (PuncPattern pattern in puncPatternsList)
				{
					if (pattern.Valid)
						m_validItemsList.Add(pattern.Pattern);
					else
						m_invalidItemsList.Add(pattern.Pattern);
				}
			}
			else
			{
				ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
				InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);
			}

			// Map the "PunctCheckLevel" parameter to a CheckingLevel; any value other than
			// "Advanced" or "Intermediate" (including null) means Basic.
			string sLevel = m_checksDataSource.GetParameterValue("PunctCheckLevel");
			CheckingLevel level;
			switch (sLevel)
			{
				case "Advanced": level = CheckingLevel.Advanced; break;
				case "Intermediate": level = CheckingLevel.Intermediate; break;
				case "Basic":
				default:
					level = CheckingLevel.Basic;
					break;
			}
			// Only the first character of "PunctWhitespaceChar" is used; when the parameter is
			// null or empty the current value of s_whitespaceRep is kept.
			string sWhitespaceRep = m_checksDataSource.GetParameterValue("PunctWhitespaceChar");
			if (!String.IsNullOrEmpty(sWhitespaceRep))
				s_whitespaceRep = sWhitespaceRep.Substring(0, 1);
			// A missing preferred locale is treated as the empty string, matching tokens whose
			// Locale is null or empty.
			string preferredLocale =
				m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

			QuotationMarkCategorizer quotationCategorizer =
				new QuotationMarkCategorizer(m_checksDataSource);

			// create processing state machines, one for body text, one for notes
			ProcessPunctationTokens bodyProcessor = new ProcessPunctationTokens(
				m_characterCategorizer, quotationCategorizer, level);

			ProcessPunctationTokens noteProcessor =	new ProcessPunctationTokens(
				m_characterCategorizer, quotationCategorizer, level);

			m_punctuationSequences = new List<TextTokenSubstring>();

			// build list of note and non-note tokens
			foreach (ITextToken tok in tokens)
			{
				if (tok.Text == null || (tok.Locale ?? string.Empty) != preferredLocale)
					continue;

				if (tok.TextType == TextType.Note)
				{
					// if a new note is starting finalize any punctuation sequences from the previous note
					if (tok.IsNoteStart)
						noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
					noteProcessor.ProcessToken(tok, desiredKey, m_punctuationSequences);
				}
				else if (tok.TextType == TextType.Verse || tok.TextType == TextType.Other)
				{
					// body text: finalize any note that was in progress and continue with body text
					noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
					bodyProcessor.ProcessToken(tok, desiredKey, m_punctuationSequences);
				}
				else if (tok.IsParagraphStart)
				{
					// A paragraph start of any other text type is not processed itself; it only
					// ends the current body sequence and flags the next body token as starting
					// a paragraph.
					bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
					bodyProcessor.TreatAsParagraphStart = true;
				}
			}

			// Flush whatever is still pending in either processor.
			noteProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);
			bodyProcessor.FinalizeResult(desiredKey, m_punctuationSequences, true);

			return m_punctuationSequences;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the <see cref="ProcessPunctationTokens"/> class by
		/// storing the supplied collaborators.
		/// </summary>
		/// <param name="categorizer">The character categorizer.</param>
		/// <param name="quotationCategorizer">The quotation mark categorizer.</param>
		/// <param name="level">Indicator to determine how much to combine contiguous
		/// punctuation sequences into patterns. Advanced = all contiguous punctuation and
		/// whitespace characters form a single pattern; Intermediate = contiguous punctuation
		/// forms a single pattern (delimited by whitespace); Basic = each punctuation character
		/// stands alone. In all three modes, whitespace before and/or after a punctuation token
		/// indicates whether it is word-initial, word-medial, word-final, or isolated.</param>
		/// ------------------------------------------------------------------------------------
		public ProcessPunctationTokens(CharacterCategorizer categorizer,
			QuotationMarkCategorizer quotationCategorizer, CheckingLevel level)
		{
			m_level = level;
			m_quotationCategorizer = quotationCategorizer;
			m_categorizer = categorizer;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Runs the Characters Scripture check: finds every invalid character sequence in the
		/// tokens, attaches a localized message to it and reports it through the callback.
		/// </summary>
		/// <param name="toks">The Scripture tokens to check.</param>
		/// <param name="record">Method to record the error; called once per invalid
		/// sequence.</param>
		/// ------------------------------------------------------------------------------------
		public void Check(IEnumerable<ITextToken> toks, RecordErrorHandler record)
		{
			// Per the original note, the caller is RunCheck(IScriptureCheck) in
			// ScrChecksDataSource.cs. Refresh the categorizer and parameters on every run.
			m_categorizer = m_checksDataSource.CharacterCategorizer;
			GetParameters();

			// Fills m_characterSequences with the invalid character sequences.
			GetReferences(toks, string.Empty, true);

			foreach (TextTokenSubstring sequence in m_characterSequences)
			{
				// More than one character means a base character plus diacritic(s).
				if (sequence.ToString().Length > 1)
				{
					sequence.Message = m_checksDataSource.GetLocalizedString(
						"Invalid or unknown character diacritic combination");
				}
				else
				{
					sequence.Message = m_checksDataSource.GetLocalizedString(
						"Invalid or unknown character");
				}

				record(new RecordErrorEventArgs(sequence, CheckId));
			}
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Collects character sequences from the tokens. A sequence is reported when it equals
		/// a non-empty <paramref name="desiredKey"/>, when no key is requested and
		/// <paramref name="invalidCharactersOnly"/> is false, or when
		/// <paramref name="invalidCharactersOnly"/> is true and the sequence is neither in the
		/// always-valid characters nor in the valid characters of the token's locale.
		/// </summary>
		/// <param name="tokens">The tokens whose text is scanned; tokens with null text are
		/// skipped.</param>
		/// <param name="desiredKey">A specific character sequence to look for, or an empty
		/// string for none.</param>
		/// <param name="invalidCharactersOnly">If <c>true</c>, tokens of every locale are
		/// scanned; if <c>false</c>, only tokens in the preferred locale.</param>
		/// <returns>The reported substrings (also stored in m_characterSequences).</returns>
		/// ------------------------------------------------------------------------------------
		private List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey,
			bool invalidCharactersOnly)
		{
			if (m_categorizer == null)
				m_categorizer = m_checksDataSource.CharacterCategorizer;

			m_characterSequences = new List<TextTokenSubstring>();

			// Valid-character lookups are built lazily, once per locale.
			Dictionary<string, Dictionary<string, bool>> validCharsByLocale =
				new Dictionary<string, Dictionary<string, bool>>();
			string preferredLocale = m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;
			bool wantSpecificKey = (desiredKey != "");

			foreach (ITextToken tok in tokens)
			{
				string tokenLocale = tok.Locale ?? string.Empty;

				if (tok.Text == null)
					continue;
				if (!invalidCharactersOnly && tokenLocale != preferredLocale)
					continue;

				Dictionary<string, bool> validChars;
				if (!validCharsByLocale.TryGetValue(tokenLocale, out validChars))
				{
					validChars = StringToDictionary(GetValidCharacters(tokenLocale));
					validCharsByLocale.Add(tokenLocale, validChars);
				}

				// Running offset of the current sequence within the token's text.
				int position = 0;

				foreach (string key in ParseCharacterSequences(tok.Text))
				{
					// Contains (rather than IndexOf) is used on the always-valid characters;
					// the original review note reports false positives with IndexOf.
					bool isInvalid = invalidCharactersOnly &&
						!m_alwaysValidCharacters.Contains(key) &&
						!validChars.ContainsKey(key);

					bool selectedByKey = wantSpecificKey
						? (desiredKey == key)
						: !invalidCharactersOnly;

					if (selectedByKey || isInvalid)
						m_characterSequences.Add(new TextTokenSubstring(tok, position, key.Length));

					position += key.Length;
				}
			}

			return m_characterSequences;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the ProcessMixedCapitalization class. The
		/// "UncapitalizedPrefixes", "CapitalizedSuffixes" and "CapitalizedPrefixes" parameters
		/// are read from the data source and split on whitespace into lists.
		/// </summary>
		/// <param name="checksDataSource">The data source supplying the character categorizer
		/// and the parameter values.</param>
		/// <param name="result">The list stored as the result collection of this processor.
		/// </param>
		/// ------------------------------------------------------------------------------------
		public ProcessMixedCapitalization(IChecksDataSource checksDataSource,
			List<TextTokenSubstring> result)
		{
			m_categorizer = checksDataSource.CharacterCategorizer;
			m_result = result;

			string uncapitalizedPrefixes = checksDataSource.GetParameterValue("UncapitalizedPrefixes");
			m_uncapitalizedPrefixes = new List<string>(uncapitalizedPrefixes.Split());

			string capitalizedSuffixes = checksDataSource.GetParameterValue("CapitalizedSuffixes");
			m_capitalizedSuffixes = new List<string>(capitalizedSuffixes.Split());

			string capitalizedPrefixes = checksDataSource.GetParameterValue("CapitalizedPrefixes");
			m_capitalizedPrefixes = new List<string>(capitalizedPrefixes.Split());
		}
Exemple #16
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Checks WordAndPuncts on "1 2 3": each number is expected as its own item, the
		/// last one with empty punctuation.
		/// NOTE(review): despite the method name, the input has no leading space - confirm
		/// whether " 1 2 3" was intended.
		/// </summary>
		/// ------------------------------------------------------------------------------------
		public void WordAndPuncts_initialSpaceFollowedByNumbers()
		{
			CharacterCategorizer cat = new CharacterCategorizer("", "", "", "", "");
			IEnumerable<WordAndPunct> words = cat.WordAndPuncts("1 2 3");
			// IEnumerator<T> is IDisposable, so release it deterministically.
			using (IEnumerator<WordAndPunct> wordCollection = words.GetEnumerator())
			{
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "1", " ", 0);
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "2", " ", 2);
				Assert.IsTrue(wordCollection.MoveNext());
				CheckWordAndPunct(wordCollection.Current, "3", "", 4);
				Assert.IsFalse(wordCollection.MoveNext());
			}
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the AWord class. The letters of the text are counted,
		/// and a prefix/suffix search is run only when the text has at least one lower-case and
		/// at least one upper-case letter.
		/// </summary>
		/// <param name="text">The text of the word.</param>
		/// <param name="categorizer">The character categorizer stored for this word.</param>
		/// ------------------------------------------------------------------------------------
		public AWord(string text, CharacterCategorizer categorizer)
		{
			m_text = text;
			m_categorizer = categorizer;

			// Updates the letter counters read below and returns the filtered word.
			string wordFormingOnly = CountLettersAndReturnWordWithOnlyWordFormingCharacters(text);

			bool hasMixedCase = m_lowerCaseLetters != 0 && m_upperCaseLetters != 0;
			if (hasMixedCase)
				FindPrefixAndSuffixIfAny(wordFormingOnly);
		}
Exemple #18
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Gets a list of TextTokenSubstrings containing the references and character offsets
		/// where quotation problems occur. Notes and body text are handled by separate
		/// processors that share one result list.
		/// </summary>
		/// <param name="tokens">The tokens (from the data source) to check for quotation problems.</param>
		/// <param name="desiredKey">empty string.</param>
		/// <returns>The quotation problems found (also stored in m_qmProblems).</returns>
		/// ------------------------------------------------------------------------------------
		public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
		{
			m_charCategorizer = m_chkDataSource.CharacterCategorizer;
			ValidItems = m_chkDataSource.GetParameterValue(m_validItemsParameter);
			InvalidItems = m_chkDataSource.GetParameterValue(m_invalidItemsParameter);

			QuotationMarkCategorizer qmCategorizer = new QuotationMarkCategorizer(m_chkDataSource);
			m_qmProblems = new List<TextTokenSubstring>();

			QTokenProcessor bodyProcessor = new QTokenProcessor(
				m_chkDataSource, m_charCategorizer, qmCategorizer, desiredKey, m_qmProblems);
			QTokenProcessor noteProcessor = new QTokenProcessor(
				m_chkDataSource, m_charCategorizer, qmCategorizer, desiredKey, m_qmProblems);

			// One wrapper instance is reused for every body token.
			VerseTextToken scrToken = new VerseTextToken();

			foreach (ITextToken tok in tokens)
			{
				if (tok.TextType == TextType.Note)
				{
					// A note start closes any sequences left open by the previous note.
					if (tok.IsNoteStart)
						noteProcessor.FinalizeResult();
					noteProcessor.ProcessToken(tok, null);
					continue;
				}

				bool isBodyToken = tok.TextType == TextType.Verse ||
					tok.TextType == TextType.Other ||
					tok.IsParagraphStart;
				if (!isBodyToken)
					continue;

				// Body text ends any note in progress before being processed itself.
				scrToken.Token = tok;
				noteProcessor.FinalizeResult();
				bodyProcessor.ProcessToken(tok, scrToken);
			}

			// Flush whatever is still pending in either processor.
			noteProcessor.FinalizeResult();
			bodyProcessor.FinalizeResult();
			return m_qmProblems;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Gets a list of TextTokenSubstrings containing the references and character offsets
		/// where repeated words occur. Notes and body text are tracked by separate processors
		/// that share one result list.
		/// </summary>
		/// <param name="tokens">The tokens (from the data source) to check for repeated words.
		/// </param>
		/// <param name="desiredKey">If looking for occurrences of a specific repeated word,
		/// set this to be that word; otherwise pass an empty string.</param>
		/// <returns>The repeated words found (also stored in m_repeatedWords).</returns>
		/// ------------------------------------------------------------------------------------
		public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
		{
#if DEBUG
			// Debug builds copy the tokens into a list; note that this enumerates 'tokens'
			// one extra time.
			List<ITextToken> AllTokens = new List<ITextToken>(tokens);
			if (AllTokens.Count == 0)
			{
				// Keep the compiler from complaining about assigning to a variable, but not using it.
			}
#endif
			characterCategorizer = m_checksDataSource.CharacterCategorizer;
			// Get a string of words that may be validly repeated.
			// Words are separated by blanks.
			ValidItems = m_checksDataSource.GetParameterValue("RepeatableWords");
			// List of words that are known to be not repeatable.
			InvalidItems = m_checksDataSource.GetParameterValue("NonRepeatableWords");

			TextType prevTextType = TextType.Other;
			m_repeatedWords = new List<TextTokenSubstring>();
			ProcessRepeatedWords bodyProcessor =
				new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);
			ProcessRepeatedWords noteProcessor =
				new ProcessRepeatedWords(characterCategorizer, m_repeatedWords, desiredKey);

			// The checks below are independent 'if's, not an else-chain: one token can trigger
			// several of them, and they run in this order.
			foreach (ITextToken tok in tokens)
			{
				// A new paragraph resets both processors.
				if (tok.IsParagraphStart)
				{
					noteProcessor.Reset();
					bodyProcessor.Reset();
				}

				if (tok.TextType == TextType.Note)
				{
					// Each note starts with a fresh note processor state.
					if (tok.IsNoteStart)
						noteProcessor.Reset();
					noteProcessor.ProcessToken(tok);
				}

				// When we leave a caption, we start over checking for repeated words.
				// A caption is a start of a paragraph, so we already start over
				// when we encounter a picture caption.
				// NOTE(review): only noteProcessor is reset here, not bodyProcessor - confirm
				// that this matches the intent described above.
				if (prevTextType == TextType.PictureCaption)
					noteProcessor.Reset();

				if (tok.TextType == TextType.Verse || tok.TextType == TextType.Other)
				{
					// Body text resets the note processor before being processed itself.
					noteProcessor.Reset();
					bodyProcessor.ProcessToken(tok);
				}

				// A chapter number resets the body processor.
				if (tok.TextType == TextType.ChapterNumber)
					bodyProcessor.Reset();

				prevTextType = tok.TextType;
			}

			return m_repeatedWords;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Scans the tokens with the matched-pair processors and returns the resulting
		/// substrings. Notes and body text are handled by separate processors that share one
		/// result list.
		/// </summary>
		/// <param name="tokens">The tokens to scan. Tokens with null text or a locale other
		/// than the "PreferredLocale" parameter are skipped.</param>
		/// <param name="desiredKey">The key passed through to the processors.</param>
		/// <returns>The substrings collected (also stored in m_unmatchedPairs).</returns>
		/// ------------------------------------------------------------------------------------
		public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
		{
#if DEBUG
			List<ITextToken> AllTokens = new List<ITextToken>(tokens);
#endif
			m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
			ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
			InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

			// A missing preferred locale is treated as the empty string.
			string preferredLocale = m_checksDataSource.GetParameterValue("PreferredLocale");
			if (preferredLocale == null)
				preferredLocale = string.Empty;

			string poeticStyles = m_checksDataSource.GetParameterValue("PoeticStyles");
			string introOutlineStyles = m_checksDataSource.GetParameterValue("IntroductionOutlineStyles");

			MatchedPairList pairList = MatchedPairList.Load(
				m_checksDataSource.GetParameterValue("MatchedPairs"),
				m_checksDataSource.GetParameterValue("DefaultWritingSystemName"));

			StyleCategorizer styleCategorizer = new StyleCategorizer(poeticStyles, introOutlineStyles);

			ProcessMatchedPairTokens bodyProcessor =
				new ProcessMatchedPairTokens(m_checksDataSource, pairList, styleCategorizer);
			ProcessMatchedPairTokens noteProcessor =
				new ProcessMatchedPairTokens(m_checksDataSource, pairList, styleCategorizer);

			m_unmatchedPairs = new List<TextTokenSubstring>();

			foreach (ITextToken tok in tokens)
			{
				bool usable = tok.Text != null && (tok.Locale ?? string.Empty) == preferredLocale;
				if (!usable)
					continue;

				if (tok.TextType == TextType.Note)
				{
					// A note start closes any sequences left open by the previous note.
					if (tok.IsNoteStart)
						noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
					noteProcessor.ProcessToken(tok, desiredKey, m_unmatchedPairs);
					continue;
				}

				if (tok.TextType == TextType.Verse || tok.TextType == TextType.Other || tok.IsParagraphStart)
				{
					// Body text ends any note in progress before being processed itself.
					noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
					bodyProcessor.ProcessToken(tok, desiredKey, m_unmatchedPairs);
				}
			}

			// Flush whatever is still pending in either processor.
			noteProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);
			bodyProcessor.FinalizeResult(desiredKey, m_unmatchedPairs);

			return m_unmatchedPairs;
		}
Exemple #21
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the QTokenProcessor class: stores the collaborators,
		/// reads the "VerboseQuotes" parameter, localizes the two "unmatched mark" messages and
		/// builds the regular expressions for quotation marks and for non-quote characters.
		/// </summary>
		/// <param name="dataSource">The checks data source.</param>
		/// <param name="charCategorizer">The character categorizer.</param>
		/// <param name="qmCategorizer">The quotation mark categorizer.</param>
		/// <param name="desiredKey">The desired key (can be string.Empty).</param>
		/// <param name="results">The result.</param>
		/// ------------------------------------------------------------------------------------
		internal QTokenProcessor(IChecksDataSource dataSource,
			CharacterCategorizer charCategorizer, QuotationMarkCategorizer qmCategorizer,
			string desiredKey, List<TextTokenSubstring> results)
		{
			m_chkDataSource = dataSource;
			m_charCategorizer = charCategorizer;
			m_qmCategorizer = qmCategorizer;
			m_desiredKey = desiredKey;
			m_results = results;

			// Verbose mode is on only for the exact value "Yes".
			string verboseSetting = m_chkDataSource.GetParameterValue("VerboseQuotes");
			m_verboseQuotes = (verboseSetting == "Yes");

			m_noCloserMsg = Localize("Unmatched opening mark: level {0}");
			m_noOpenerMsg = Localize("Unmatched closing mark: level {0}");

			string quotePattern = qmCategorizer.Pattern;
			m_regExQuotes = new Regex(quotePattern);

			// Make sure brackets are escaped before the pattern is embedded in a negated
			// character class.
			string escapedPattern = quotePattern.Replace("]", "\\]");
			m_regExNonQuotes = new Regex("[^" + escapedPattern + "|\\s]");
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Initializes a new instance of the ProcessRepeatedWords class.
		/// </summary>
		/// <param name="characterCategorizer">The character categorizer.</param>
		/// <param name="result">The list stored as the result collection of this processor.
		/// </param>
		/// <param name="desiredKey">The key stored by this processor; it is kept in lower
		/// case. Must not be null.</param>
		/// ------------------------------------------------------------------------------------
		public ProcessRepeatedWords(CharacterCategorizer characterCategorizer,
			List<TextTokenSubstring> result, string desiredKey)
		{
			// string.ToLower() lower-cases using the current culture.
			string loweredKey = desiredKey.ToLower();

			this.desiredKey = loweredKey;
			this.result = result;
			this.characterCategorizer = characterCategorizer;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Get all instances of the item being checked in the token list passed.
		/// This includes both valid and invalid instances.
		/// This is used 1) to create an inventory of these items, showing the user all
		/// instances of an item with a specified key, and 2) with a "desiredKey" in order to
		/// fetch instances of a specific item (e.g. all the places where "the" is a repeated
		/// word).
		/// </summary>
		/// <param name="tokens">Tokens for text to be scanned. Tokens with null text or a
		/// locale other than the "PreferredLocale" parameter are skipped.</param>
		/// <param name="desiredKey">If you only want instances of a specific key (e.g. one word,
		/// one punctuation pattern, one character, etc.) place it here. Empty string returns
		/// all items.</param>
		/// <returns>List of token substrings (also stored in m_mixedCapitalization).</returns>
		/// ------------------------------------------------------------------------------------
		public List<TextTokenSubstring> GetReferences(IEnumerable<ITextToken> tokens, string desiredKey)
		{
#if DEBUG
			List<ITextToken> AllTokens = new List<ITextToken>(tokens);
#endif
			m_characterCategorizer = m_checksDataSource.CharacterCategorizer;
			ValidItems = m_checksDataSource.GetParameterValue(kValidItemsParameter);
			InvalidItems = m_checksDataSource.GetParameterValue(kInvalidItemsParameter);

			// A missing preferred locale is treated as the empty string.
			string preferredLocale =
				m_checksDataSource.GetParameterValue("PreferredLocale") ?? string.Empty;

			m_mixedCapitalization = new List<TextTokenSubstring>();
			ProcessMixedCapitalization processor =
				new ProcessMixedCapitalization(m_checksDataSource, m_mixedCapitalization);

			foreach (ITextToken tok in tokens)
			{
				// Skip tokens without text, as the other checks in this file do, instead of
				// handing a null string to the word splitter.
				if (tok.Text == null || (tok.Locale ?? string.Empty) != preferredLocale)
					continue;

				foreach (WordAndPunct wap in m_characterCategorizer.WordAndPuncts(tok.Text))
					processor.ProcessWord(tok, wap, desiredKey);
			}

			return m_mixedCapitalization;
		}