/// ------------------------------------------------------------------------------------
/// <summary>
/// Counts the base character at <paramref name="iBaseCharacter"/> together with every
/// consecutive diacritic that directly follows it in the token text.
/// </summary>
/// <param name="tok">The text token.</param>
/// <param name="iBaseCharacter">The index of the base character in the text token.</param>
/// <returns>1 for the base character plus the number of diacritics right after it</returns>
/// ------------------------------------------------------------------------------------
private int GetLengthOfChar(ITextToken tok, int iBaseCharacter)
{
    int length = 1;
    for (int i = iBaseCharacter + 1; i < tok.Text.Length; i++)
    {
        // Stop at the first following character that is not a diacritic.
        if (!m_categorizer.IsDiacritic(tok.Text[i]))
        {
            break;
        }
        length++;
    }
    return length;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Remembers the paragraph style of a token that starts a paragraph and resets the
/// "paragraph text found" state. Tokens that do not start a paragraph are ignored.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// ------------------------------------------------------------------------------------
private void RecordParagraphStyle(ITextToken tok)
{
    if (!tok.IsParagraphStart)
    {
        return;
    }

    m_paragraphStyle = tok.ParaStyleName;
    m_foundParagraphText = false;

    // When paragraphs are processed separately, the sentence-start flag is cleared
    // at each paragraph start.
    if (m_processParagraphsSeparately)
    {
        m_fAtSentenceStart = false;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="TextTokenSubstring"/> class with a
/// single source token.
/// </summary>
/// <param name="token">The token.</param>
/// <param name="offset">The offset of the substring within the token text; must be
/// between 0 and the length of the token text.</param>
/// <param name="length">The length of the substring; must be 0 or greater and must not
/// extend past the end of the token text.</param>
/// <param name="msg">The error message.</param>
/// <exception cref="ArgumentNullException"><paramref name="token"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> or
/// <paramref name="length"/> does not describe a range inside the token text.</exception>
/// ------------------------------------------------------------------------------------
public TextTokenSubstring(ITextToken token, int offset, int length, string msg)
{
    if (token == null)
    {
        throw new ArgumentNullException("token");
    }

    int textLength = token.Text.Length;
    if (offset < 0)
    {
        throw new ArgumentOutOfRangeException("offset", "Offset must be 0 or greater.");
    }
    if (offset > textLength)
    {
        throw new ArgumentOutOfRangeException("offset");
    }
    if (length < 0)
    {
        throw new ArgumentOutOfRangeException("length", "Length must be 0 or greater.");
    }
    // Compare against the remaining room instead of computing offset + length: the sum
    // can overflow to a negative value for a very large length and slip past the check.
    // The subtraction cannot overflow because 0 <= offset <= textLength here.
    if (length > textLength - offset)
    {
        throw new ArgumentOutOfRangeException("length");
    }

    m_tokens = new List<ITextToken>(new ITextToken[] { token });
    m_offset = offset;
    m_length = length;
    m_message = msg;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Processes the Scripture token: walks its text (with abbreviations blanked out) and
/// adds a substring to <paramref name="result"/> for each lowercase word-forming
/// character that triggers a paragraph-style, character-style or sentence-initial
/// capitalization error.
/// </summary>
/// <param name="tok">The token.</param>
/// <param name="result">The list that receives the capitalization errors.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, List<TextTokenSubstring> result)
{
    string tokenText = RemoveAbbreviations(tok);

    RecordParagraphStyle(tok);
    RecordCharacterStyle(tok);

    // must be at least one character in token to check the case of
    if (tok.Text == String.Empty)
    {
        return;
    }

    for (int i = 0; i < tokenText.Length; i++)
    {
        char current = tokenText[i];

        if (IsSentenceFinalPunctuation(current))
        {
            // A new sentence starts after this punctuation unless the very next
            // character is a digit. Punctuation at the end of the token counts
            // as a sentence end.
            int next = i + 1;
            m_fAtSentenceStart = next == tokenText.Length || !char.IsDigit(tokenText[next]);
            continue;
        }

        if (!m_categorizer.IsWordFormingCharacter(current))
        {
            continue;
        }

        if (m_categorizer.IsLower(current))
        {
            TextTokenSubstring firstLetter = GetSubstring(tok, i);

            // The checks run in this order and stop at the first one that reports.
            bool reported = CheckForParaCapitalizationError(tok, firstLetter, result);
            if (!reported)
            {
                reported = CheckForCharStyleCapilizationError(tok, firstLetter, result);
            }
            if (!reported && m_fAtSentenceStart)
            {
                firstLetter.Message = CapitalizationCheck.GetErrorMessage(m_checksDataSource,
                    StyleCapInfo.CapCheckTypes.SentenceInitial, string.Empty);
                result.Add(firstLetter);
            }
        }

        // A word-forming character has been seen, so the pending "first letter"
        // states are all consumed.
        m_fAtSentenceStart = false;
        m_foundCharacterText = true;
        m_foundParagraphText = true;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks for missing verses in the current chapter. Every run of consecutive empty
/// slots in <paramref name="versesFound"/> (from index 1 on) is reported as one error
/// attached to the closest preceding token.
/// </summary>
/// <param name="versesFound">Tokens indexed by verse number; index 0 holds the token
/// used as the initial anchor.</param>
/// <param name="bookId">The book number used to build the missing references.</param>
/// <param name="chapNumber">The chapter number used to build the missing references.</param>
/// ------------------------------------------------------------------------------------
private void CheckForMissingVerses(ITextToken[] versesFound, int bookId, int chapNumber)
{
    ITextToken anchor = versesFound[0];
    int verse = 1;

    while (verse < versesFound.Length)
    {
        ITextToken found = versesFound[verse];
        if (found != null)
        {
            anchor = found;
            verse++;
            continue;
        }

        // At this point, we know we've found a missing verse. Extend the range to
        // cover every directly following verse that is missing as well.
        int firstMissing = verse;
        int lastMissing = verse;
        while (lastMissing + 1 < versesFound.Length && versesFound[lastMissing + 1] == null)
        {
            lastMissing++;
        }

        anchor.MissingStartRef = new BCVRef(bookId, chapNumber, firstMissing);

        // If the anchor is a verse token and it's verse 1 that's missing, then we are
        // dealing with a missing chapter token and a missing verse 1 token in that
        // chapter. In that case the offset falls just before the verse of the token
        // (the first verse token found in the chapter, assumed to belong to a verse
        // after verse 1). Otherwise the error is placed at the end of the anchor.
        int offset;
        if (anchor.TextType == TextType.VerseNumber && verse == 1)
        {
            offset = 0;
        }
        else
        {
            offset = anchor.Text.Length;
        }

        if (firstMissing == lastMissing)
        {
            AddError(anchor, offset, 0, Localize("Missing verse number {0}"), firstMissing);
        }
        else
        {
            anchor.MissingEndRef = new BCVRef(bookId, chapNumber, lastMissing);
            AddError(anchor, offset, 0, Localize("Missing verse numbers {0}-{1}"),
                firstMissing, lastMissing);
        }

        // Resume right after the reported range.
        verse = lastMissing + 1;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Extract the punctuation sequences from this token by classifying each character
/// as punctuation, digit, whitespace or anything else.
/// </summary>
/// <param name="tok">The token whose text is scanned.</param>
/// <param name="desiredKey">Passed through to FinalizeResult.</param>
/// <param name="result">Passed through to FinalizeResult.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, string desiredKey, List<TextTokenSubstring> result)
{
    if (tok.IsParagraphStart || m_fTreatAsParagraphStart)
    {
        ProcessWhitespaceOrParagraph(true);
        m_fTreatAsParagraphStart = false;
    }

    // for each character in token
    for (int i = 0; i < tok.Text.Length; ++i)
    {
        char cc = tok.Text[i];

        if (m_categorizer.IsPunctuation(cc))
        {
            ProcessPunctuation(tok, i);
            continue;
        }

        if (!char.IsDigit(cc))
        {
            if (char.IsWhiteSpace(cc))
            {
                ProcessWhitespaceOrParagraph(false);
            }
            else
            {
                // if not punctuation, whitespace, or digit; it must be the start of a
                // new word, therefore finalize any open punctuation sequence
                FinalizeResult(desiredKey, result, false);
            }
            continue;
        }

        // If the previous finalized was done with a number, and we have a single
        // punctuation mark followed by another number, ignore this sequence,
        // e.g. 3:14
        bool punctuationBetweenNumbers = m_finalizedWithNumber && m_puncts.Count == 1 &&
            m_puncts[0].TokenType == PunctuationTokenType.punctuation;
        if (punctuationBetweenNumbers)
        {
            m_puncts.Clear();
        }
        else
        {
            ProcessDigit(tok, i);
            FinalizeResult(desiredKey, result, false);
        }
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks for missing chapters in the current book and records a "missing chapter"
/// error for every chapter not flagged in <paramref name="chaptersFound"/>. When a
/// single chapter is being checked (m_nChapterToCheck is non-zero), all other
/// chapters are skipped.
/// </summary>
/// <param name="chaptersFound">Flags indexed by chapter number; index 0 is not
/// inspected.</param>
/// ------------------------------------------------------------------------------------
private void CheckForMissingChapters(bool[] chaptersFound)
{
    for (int chap = 1; chap < chaptersFound.Length; chap++)
    {
        bool excludedFromCheck = m_nChapterToCheck != 0 && chap != m_nChapterToCheck;
        if (chaptersFound[chap] || excludedFromCheck)
        {
            continue;
        }

        // Find the last chapter token that comes before the first token with a
        // chapter number greater than the missing one.
        ChapterToken precedingChapter = null;
        for (int i = 0; i < m_chapTokens.Count && m_chapTokens[i].ChapterNumber <= chap; i++)
        {
            precedingChapter = m_chapTokens[i];
        }

        // TODO: Deal with what token to use if a book has no chapters at all.
        ITextToken token;
        int offset;
        if (precedingChapter != null)
        {
            token = precedingChapter.Token;
            // Before the token for an implicit chapter, otherwise after its text.
            offset = precedingChapter.Implicit ? 0 : token.Text.Length;
        }
        else if (m_chapTokens.Count > 0)
        {
            // No preceding chapter: fall back to the start of the first chapter token.
            token = m_chapTokens[0].Token;
            offset = 0;
        }
        else
        {
            continue;
        }

        if (token == null)
        {
            continue;
        }

        token.MissingStartRef = new BCVRef(BCVRef.BookToNumber(token.ScrRefString), chap, 0);
        token.MissingEndRef = null;
        AddError(token, offset, 0, Localize("Missing chapter number {0}"), chap);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Scans the token text for characters that belong to a pair in m_pairList and runs
/// the pair bookkeeping for each one. Before scanning, results are finalized when
/// AnyFoundPairsClosedByPara is set and the token starts a paragraph whose style is
/// not poetic.
/// </summary>
/// <param name="tok">The token whose text is scanned.</param>
/// <param name="desiredKey">Passed through to FinalizeResult.</param>
/// <param name="result">Passed through to FinalizeResult.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, string desiredKey, List<TextTokenSubstring> result)
{
    bool finalizeAtParagraph = AnyFoundPairsClosedByPara && tok.IsParagraphStart &&
        !m_styleCategorizer.IsPoeticStyle(tok.ParaStyleName);
    if (finalizeAtParagraph)
    {
        FinalizeResult(desiredKey, result);
    }

    for (int i = 0; i < tok.Text.Length; i++)
    {
        string current = tok.Text[i].ToString();
        if (!m_pairList.BelongsToPair(current))
        {
            continue;
        }

        StoreFoundPairToken(tok, i);
        RemoveMatchedPunctAtEndOfFirstWordInIntroOutline(tok, i);
        RemoveIfMatchedPairFound();
        RecordOverlappingPairs();
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Blanks out every occurrence of each known abbreviation in the text of a Scripture
/// token. Each abbreviation is replaced by the same number of spaces so that
/// character positions in the returned text still line up with the token text.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// <returns>Text of the token with any abbreviations replaced with spaces.</returns>
/// ------------------------------------------------------------------------------------
private string RemoveAbbreviations(ITextToken tok)
{
    string masked = tok.Text;
    foreach (string abbreviation in m_abbreviations)
    {
        // Empty entries are skipped.
        if (abbreviation != "")
        {
            masked = masked.Replace(abbreviation, new string(' ', abbreviation.Length));
        }
    }

    Debug.Assert(tok.Text.Length == masked.Length,
        "Length of text should not change",
        "Abbreviations are replaced by spaces, but the overall text length should stay the same.");
    return masked;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a chapter token from a token whose text holds the chapter number.
/// The number is built from the digits of the text; the first non-digit character
/// marks the token invalid and sets the chapter number to -1. The token is also
/// marked invalid when its text does not match the given format.
/// </summary>
/// <param name="token">The token holding the chapter number text.</param>
/// <param name="chapterNumberFormat">The pattern the token text is matched against.</param>
/// ------------------------------------------------------------------------------------
internal ChapterToken(ITextToken token, Regex chapterNumberFormat)
{
    Token = token;
    m_chapNumber = 0;

    if (!chapterNumberFormat.IsMatch(Token.Text))
    {
        Valid = false;
    }

    foreach (char ch in token.Text)
    {
        if (!Char.IsDigit(ch))
        {
            Valid = false;
            m_chapNumber = -1;
            break;
        }

        // Append this digit to the number accumulated so far.
        m_chapNumber = (m_chapNumber * 10) + (int)Char.GetNumericValue(ch);
    }

    Implicit = false;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="TextTokenSubstring"/> class with a
/// single source token.
/// </summary>
/// <param name="token">The token.</param>
/// <param name="offset">The offset of the substring within the token text; must be
/// between 0 and the length of the token text.</param>
/// <param name="length">The length of the substring; must be 0 or greater and must not
/// extend past the end of the token text.</param>
/// <param name="msg">The error message.</param>
/// <exception cref="ArgumentNullException"><paramref name="token"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> or
/// <paramref name="length"/> does not describe a range inside the token text.</exception>
/// ------------------------------------------------------------------------------------
public TextTokenSubstring(ITextToken token, int offset, int length, string msg)
{
    if (token == null)
    {
        throw new ArgumentNullException("token");
    }

    int textLength = token.Text.Length;
    if (offset < 0)
    {
        throw new ArgumentOutOfRangeException("offset", "Offset must be 0 or greater.");
    }
    if (offset > textLength)
    {
        throw new ArgumentOutOfRangeException("offset");
    }
    if (length < 0)
    {
        throw new ArgumentOutOfRangeException("length", "Length must be 0 or greater.");
    }
    // Compare against the remaining room instead of computing offset + length: the sum
    // can overflow to a negative value for a very large length and slip past the check.
    // The subtraction cannot overflow because 0 <= offset <= textLength here.
    if (length > textLength - offset)
    {
        throw new ArgumentOutOfRangeException("length");
    }

    m_tokens = new List<ITextToken>(new ITextToken[] { token });
    m_offset = offset;
    m_length = length;
    m_message = msg;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks the given tokens for chapter/verse errors and calls the given RecordError
/// handler for each one. The tokens are first grouped into chapter tokens with their
/// verse tokens (creating implicit chapter 1 / verse 1 entries where text appears
/// before any chapter or verse number), then the chapter numbers are checked.
/// </summary>
/// <param name="toks">The tokens to check.</param>
/// <param name="record">Method to call to record errors.</param>
/// ------------------------------------------------------------------------------------
public void Check(IEnumerable<ITextToken> toks, RecordErrorHandler record)
{
    GetParameters();

    m_recordError = record;
    m_versesFound = new List<int>();
    m_chapTokens.Clear();

    ChapterToken currChapterToken = null;
    VerseToken currVerseToken = null;

    foreach (ITextToken token in toks)
    {
        // This token is only necessary when a chapter one is missing
        // and we need a token to use for reporting that it's missing.
        if (m_fallbackToken == null)
        {
            m_fallbackToken = token;
        }

        if (token.TextType == TextType.ChapterNumber)
        {
            currChapterToken = new ChapterToken(token, m_chapterNumberFormat);
            currVerseToken = null;
            m_chapTokens.Add(currChapterToken);
            continue;
        }

        if (token.TextType == TextType.VerseNumber)
        {
            if (currChapterToken == null)
            {
                // assume chapter one
                currChapterToken = new ChapterToken(token, 1);
                m_chapTokens.Add(currChapterToken);
            }

            currVerseToken = new VerseToken(token);
            currChapterToken.VerseTokens.Add(currVerseToken);
            continue;
        }

        if (token.TextType != TextType.Verse)
        {
            continue;
        }

        if (currVerseToken == null)
        {
            if (currChapterToken == null)
            {
                // no chapter token and no verse number token
                // oh no! use verse text token as default, but system
                // should error on missing verse first.
                // assume chapter one
                currChapterToken = new ChapterToken(token, 1);
                m_chapTokens.Add(currChapterToken);
            }

            // No verse number token seen yet in this chapter: assume verse one and
            // use the verse text token as its default token.
            currVerseToken = new VerseToken(token, 1);
            currChapterToken.VerseTokens.Add(currVerseToken);
        }
        else if (currChapterToken == null)
        {
            // this case should not happen because chapter tokens
            // are automatically created if a verse number token is
            // encountered first
            Debug.Assert(false, "verse number token found without chapter number token");
        }

        // Otherwise we have both a chapter token and a verse number token: nothing
        // to set up.
        currVerseToken.IncrementVerseTextCount(token);
    }

    CheckChapterNumbers();
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Builds a substring covering the word and adds it to the result list when no
/// specific key is desired (desiredKey is the empty string) or when the substring's
/// inventory text equals the desired key.
/// </summary>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word, with its offset in the token.</param>
/// ------------------------------------------------------------------------------------
private void AddWord(ITextToken tok, WordAndPunct wap)
{
    TextTokenSubstring wordSubstring = new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

    bool wanted = desiredKey == "" || desiredKey == wordSubstring.InventoryText;
    if (wanted)
    {
        result.Add(wordSubstring);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a new instance of the <see cref="TextTokenSubstring"/> class with a
/// single source token and no error message. Delegates to the four-argument
/// constructor, passing null as the message.
/// </summary>
/// <param name="token">The token.</param>
/// <param name="offset">The offset of the substring within the token text.</param>
/// <param name="length">The length of the substring.</param>
/// ------------------------------------------------------------------------------------
public TextTokenSubstring(ITextToken token, int offset, int length)
    : this(token, offset, length, null)
{
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Creates a checking error if the current character style is listed among the
/// capitalized styles as a character style, given that the first word-forming
/// character in that style is lowercase. Only the first call after the
/// "character text found" flag was reset does any work; the flag is set again here.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// <param name="ttsFirstLetter">The token substring of the first word-forming character
/// in the given token.</param>
/// <param name="result">The result.</param>
/// <returns><c>true</c> if an error was added to the list of results; otherwise
/// <c>false</c></returns>
/// ------------------------------------------------------------------------------------
private bool CheckForCharStyleCapilizationError(ITextToken tok,
    TextTokenSubstring ttsFirstLetter, List<TextTokenSubstring> result)
{
    if (m_foundCharacterText)
    {
        return false;
    }
    m_foundCharacterText = true;

    // The first word-forming character of the character style is lowercase.
    // Look it up in the capitalized styles dictionary to determine if it should
    // be uppercase.
    StyleCapInfo styleCapInfo;
    if (!m_allCapitalizedStyles.TryGetValue(m_characterStyle, out styleCapInfo))
    {
        return false;
    }
    if (styleCapInfo.m_type != StyleInfo.StyleTypes.character)
    {
        return false;
    }

    ttsFirstLetter.InventoryText = m_characterStyle;
    ttsFirstLetter.Message = CapitalizationCheck.GetErrorMessage(m_checksDataSource,
        styleCapInfo.m_capCheck, m_characterStyle);
    result.Add(ttsFirstLetter);
    return true;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Splits the token text into words using the character categorizer and processes
/// each word in turn.
/// </summary>
/// <param name="tok">The token whose text is split into words.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok)
{
    foreach (WordAndPunct wordAndPunct in characterCategorizer.WordAndPuncts(tok.Text))
    {
        ProcessWord(tok, wordAndPunct);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Adds an error for a missing chapter number. The token's missing start reference
/// is set to the token's own reference with the chapter replaced by the missing
/// chapter and the verse set to 0.
/// </summary>
/// <param name="token">The token the error is attached to.</param>
/// <param name="missingChapter">The number of the missing chapter.</param>
/// <param name="offset">The offset in the token where the error is reported.</param>
/// ------------------------------------------------------------------------------------
private void AddMissingChapterError(ITextToken token, int missingChapter, int offset)
{
    BCVRef missingRef = new BCVRef(token.ScrRefString);
    missingRef.Chapter = missingChapter;
    missingRef.Verse = 0;
    token.MissingStartRef = missingRef;

    // Zero-length error at the given offset.
    AddError(token, offset, 0, Localize("Missing chapter number {0}"), missingChapter);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a chapter token whose number is supplied by the caller instead of
/// being read from the token text. The chapter token is marked as implicit.
/// </summary>
/// <param name="token">The token associated with the chapter.</param>
/// <param name="chapNumber">The chapter number to assign.</param>
/// ------------------------------------------------------------------------------------
internal ChapterToken(ITextToken token, int chapNumber)
{
    Token = token;
    m_chapNumber = chapNumber;
    Implicit = true;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a verse token from a verse number token.
/// </summary>
/// <param name="verseNumber">The token stored as this verse's number token.</param>
/// ------------------------------------------------------------------------------------
internal VerseToken(ITextToken verseNumber)
{
    m_verseNumberToken = verseNumber;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Check verse numbers of one chapter. Each verse token is parsed and checked for
/// spaces, invalid numbers, duplicates, unexpected or out-of-range numbers, wrong
/// order and missing verse parts (a/b); verse numbers without verse text are also
/// reported. Finally the verses that were never found are reported as missing.
/// </summary>
/// <param name="chapToken">The chapter whose verse tokens are checked.</param>
/// <param name="bookId">The book number, used to look up the last verse of the chapter
/// and to build missing-verse references.</param>
/// ------------------------------------------------------------------------------------
private void CheckVerseNumbers(ChapterToken chapToken, int bookId)
{
    int lastVrsInChap = m_versification.LastVerse(bookId, chapToken.ChapterNumber);
    int nextExpectedVerse = 1;
    bool expectingPartB = false;
    int prevVerseStart = 0;
    int prevVerseEnd = 0;

    // Indexed by verse number; slot 0 holds the chapter token.
    ITextToken[] versesFound = new ITextToken[lastVrsInChap + 1];
    versesFound[0] = chapToken.Token;

    foreach (VerseToken verseToken in chapToken.VerseTokens)
    {
        ITextToken token = verseToken.VerseNumber;
        ITextToken reportedToken = token;
        string msg = null;
        int offset = 0;
        int length = token.Text.Length;
        object[] errorArgs = null;
        bool countFoundVerses = false;
        int curVerseStart;
        int curVerseEnd;
        VersePart vrsPart;

        // An implicit verse 1 has no number to parse; just mark verse 1 as found.
        if (verseToken.ImplicitVerseNumber == 1)
        {
            versesFound[1] = token;
            continue;
        }

        ParseVerseResult parseResult = ParseVerseNumber(token.Text, out curVerseStart,
            out curVerseEnd, out vrsPart);
        bool isSingleVerse = curVerseStart == curVerseEnd;
        bool isPartB = vrsPart == VersePart.PartB;

        if (parseResult == ParseVerseResult.ValidWithSpaceInVerse)
        {
            // Log error telling user there are spaces before or after the verse
            // number. This means the space(s) have the verse number style. This isn't
            // considered an invalid verse number, but we do need to tell the user.
            AddError(token, 0, token.Text.Length, Localize("Space found in verse number"),
                token.Text);
        }
        else if (parseResult == ParseVerseResult.ValidWithSpaceInVerseBridge)
        {
            // Log error telling user there are spaces in a verse bridge. This
            // means the space(s) have the verse number style. This isn't considered
            // an invalid verse number, but we do need to tell the user.
            AddError(token, 0, token.Text.Length, Localize("Space found in verse bridge"),
                token.Text);
        }

        // NOTE: the order of the branches below matters. The AnyOverlappingVerses
        // calls assign errorArgs even when they return false.
        if (parseResult == ParseVerseResult.Invalid)
        {
            msg = Localize("Invalid verse number");
        }
        else if (parseResult != ParseVerseResult.InvalidFormat &&
            VersesAlreadyFound(curVerseStart, curVerseEnd, versesFound) &&
            (!expectingPartB || !isPartB))
        {
            if (AnyOverlappingVerses(curVerseStart, curVerseEnd,
                prevVerseStart, prevVerseEnd, out errorArgs))
            {
                // Duplicate verse(s) found.
                if (errorArgs.Length == 1)
                {
                    msg = Localize("Duplicate verse number");
                }
                else
                {
                    msg = Localize("Duplicate verse numbers");
                }
            }
            else
            {
                // Verse number(s) are unexpected
                if (isSingleVerse)
                {
                    msg = Localize("Unexpected verse number");
                }
                else
                {
                    msg = Localize("Unexpected verse numbers");
                }
            }
        }
        else if (AnyOverlappingVerses(curVerseStart, curVerseEnd,
            lastVrsInChap + 1, int.MaxValue, out errorArgs))
        {
            countFoundVerses = true;

            // Start and/or end verse is out of range
            if (errorArgs.Length == 1)
            {
                msg = Localize("Verse number out of range");
            }
            else
            {
                msg = Localize("Verse numbers out of range");
            }
        }
        else if (curVerseStart < nextExpectedVerse)
        {
            // Verse number(s) are out of order
            countFoundVerses = true;
            if (nextExpectedVerse <= lastVrsInChap)
            {
                errorArgs = new object[] { nextExpectedVerse };
                if (isSingleVerse)
                {
                    msg = Localize("Verse number out of order; expected verse {0}");
                }
                else
                {
                    msg = Localize("Verse numbers out of order; expected verse {0}");
                }
            }
            else if (isSingleVerse)
            {
                msg = Localize("Verse number out of order");
            }
            else
            {
                msg = Localize("Verse numbers out of order");
            }
        }
        else if (isPartB != expectingPartB && isSingleVerse)
        {
            // Missing part A or B
            // TODO: cover cases like "4a 5-7" and "4 5b-7". This would require
            // ParseVerseNumber() to detect verse parts at the beginning of bridges.
            if (isPartB)
            {
                reportedToken = token;
            }
            else
            {
                reportedToken = versesFound[prevVerseEnd];
            }

            msg = Localize("Missing verse number {0}");

            if (isPartB)
            {
                // Part A is missing: report just before the current token.
                offset = 0;
                errorArgs = new object[] { string.Format("{0}a", curVerseStart) };
            }
            else
            {
                // Part B is missing: report at the end of the previous verse's token.
                offset = reportedToken.Text.Length;
                errorArgs = new object[] { string.Format("{0}b", prevVerseEnd) };
            }

            length = 0;
            countFoundVerses = true;
        }
        else if (isPartB && curVerseStart > prevVerseEnd && isSingleVerse)
        {
            // Missing both a part B and A
            reportedToken = versesFound[prevVerseEnd];
            AddError(reportedToken, reportedToken.Text.Length, 0,
                Localize("Missing verse number {0}"),
                new object[] { string.Format("{0}b", prevVerseEnd) });
            AddError(token, 0, 0, Localize("Missing verse number {0}"),
                new object[] { string.Format("{0}a", curVerseStart) });
        }

        if (msg != null)
        {
            // Report the error found.
            if (errorArgs != null)
            {
                AddError(reportedToken, offset, length, msg, errorArgs);
            }
            else
            {
                AddError(reportedToken, offset, length, msg);
            }
        }

        if (msg == null || countFoundVerses)
        {
            // Either no error was found for the current verse range or the error
            // still counts the verses as found: mark every verse of the range
            // (limited to the last verse of the chapter) with this token.
            int lastToMark = Math.Min(curVerseEnd, lastVrsInChap);
            for (int i = curVerseStart; i <= lastToMark; i++)
            {
                versesFound[i] = token;
            }
        }

        if (parseResult == ParseVerseResult.InvalidFormat)
        {
            AddError(token, 0, token.Text.Length, Localize("Invalid verse number"), token.Text);
        }

        // only worry about this if the chapter and/or verse tokens are in order
        if (verseToken.VerseTextCount < 1)
        {
            AddError(verseToken.VerseNumber, 0, verseToken.VerseNumber.Text.Length,
                Localize("Missing verse text in verse {0}"), verseToken.VerseNumber.Text);
        }

        // Determine next expected verse.
        // Don't expect a partB if there was an error with partA
        expectingPartB = (vrsPart == VersePart.PartA && msg == null);
        if (!expectingPartB && curVerseEnd <= lastVrsInChap)
        {
            nextExpectedVerse = curVerseEnd + 1;
        }

        prevVerseStart = curVerseStart;
        prevVerseEnd = curVerseEnd;
    }

    CheckForMissingVerses(versesFound, bookId, chapToken.ChapterNumber);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Increment verse text count, unless the token text is empty after trimming.
/// </summary>
/// <param name="token">The verse text token to count.</param>
/// ------------------------------------------------------------------------------------
internal void IncrementVerseTextCount(ITextToken token)
{
    // only count tokens that aren't all whitespace.
    bool hasVisibleText = token.Text.Trim().Length != 0;
    if (hasVisibleText)
    {
        m_nbrTextTokens++;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a verse token that has no verse number token of its own: the given
/// token stands in as the verse number token and the verse number is supplied by
/// the caller.
/// </summary>
/// <param name="implicitVerseNumber">The token stored as this verse's number token.</param>
/// <param name="verseNumber">The verse number stored as the implicit verse number.</param>
/// ------------------------------------------------------------------------------------
internal VerseToken(ITextToken implicitVerseNumber, int verseNumber)
{
    m_verseNumberToken = implicitVerseNumber;
    m_implicitVerseNumber = verseNumber;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a verse token from a verse number token.
/// </summary>
/// <param name="verseNumber">The token stored as this verse's number token.</param>
/// ------------------------------------------------------------------------------------
internal VerseToken(ITextToken verseNumber)
{
    m_verseNumberToken = verseNumber;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks the collected chapter tokens: reports invalid, out-of-range, duplicate and
/// out-of-order chapter numbers, checks the verse numbers of every chapter that
/// passed, and finally checks for missing chapters. When a single chapter is being
/// checked (m_nChapterToCheck is non-zero), all other chapter tokens are skipped.
/// </summary>
/// ------------------------------------------------------------------------------------
private void CheckChapterNumbers()
{
    int bookId = BCVRef.BookToNumber(m_sBookId);
    int lastChapInBook = m_versification.LastChapter(bookId);
    int nextExpectedChapter = 1;
    int prevChapNumber = 0;

    // Indexed by chapter number; index 0 is unused.
    bool[] chaptersFound = new bool[lastChapInBook + 1];

    foreach (ChapterToken chapToken in m_chapTokens)
    {
        int chapNumber = chapToken.ChapterNumber;
        if (m_nChapterToCheck != 0 && chapNumber != m_nChapterToCheck)
        {
            continue;
        }

        string msg = null;
        int errorArg = chapNumber;
        ITextToken token = chapToken.Token;

        if (!chapToken.Valid)
        {
            // Chapter number is invalid
            AddError(token, 0, token.Text.Length, Localize("Invalid chapter number"), errorArg);
        }

        if (chapNumber >= 1)
        {
            if (chapNumber > lastChapInBook)
            {
                // Chapter number is out of range
                msg = Localize("Chapter number out of range");
            }
            else if (chapNumber == prevChapNumber)
            {
                // Chapter number is repeated
                msg = Localize("Duplicate chapter number");
            }
            else if (chapNumber < nextExpectedChapter)
            {
                // Chapter number is out of order
                msg = Localize("Chapter out of order; expected chapter {0}");
                errorArg = nextExpectedChapter;
            }

            if (msg == null)
            {
                // The chapter number is acceptable: record it and check its verses.
                chaptersFound[chapNumber] = true;
                CheckVerseNumbers(chapToken, bookId);
            }
            else
            {
                AddError(token, 0, token.Text.Length, msg, errorArg);
            }
        }

        prevChapNumber = chapNumber;

        // The expected chapter never moves backwards.
        if (chapNumber + 1 > nextExpectedChapter)
        {
            nextExpectedChapter = chapNumber + 1;
        }
    }

    CheckForMissingChapters(chaptersFound);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks for missing verses in the current chapter. Every run of consecutive empty
/// slots in <paramref name="versesFound"/> (from index 1 on) is reported as one error
/// attached to the closest preceding token.
/// </summary>
/// <param name="versesFound">Tokens indexed by verse number; index 0 holds the token
/// used as the initial anchor.</param>
/// <param name="bookId">The book number used to build the missing references.</param>
/// <param name="chapNumber">The chapter number used to build the missing references.</param>
/// ------------------------------------------------------------------------------------
private void CheckForMissingVerses(ITextToken[] versesFound, int bookId, int chapNumber)
{
    ITextToken anchor = versesFound[0];
    int verse = 1;

    while (verse < versesFound.Length)
    {
        ITextToken found = versesFound[verse];
        if (found != null)
        {
            anchor = found;
            verse++;
            continue;
        }

        // At this point, we know we've found a missing verse. Extend the range to
        // cover every directly following verse that is missing as well.
        int firstMissing = verse;
        int lastMissing = verse;
        while (lastMissing + 1 < versesFound.Length && versesFound[lastMissing + 1] == null)
        {
            lastMissing++;
        }

        anchor.MissingStartRef = new BCVRef(bookId, chapNumber, firstMissing);

        // If the anchor is a verse token and it's verse 1 that's missing, then we are
        // dealing with a missing chapter token and a missing verse 1 token in that
        // chapter. In that case the offset falls just before the verse of the token
        // (the first verse token found in the chapter, assumed to belong to a verse
        // after verse 1). Otherwise the error is placed at the end of the anchor.
        int offset;
        if (anchor.TextType == TextType.VerseNumber && verse == 1)
        {
            offset = 0;
        }
        else
        {
            offset = anchor.Text.Length;
        }

        if (firstMissing == lastMissing)
        {
            AddError(anchor, offset, 0, Localize("Missing verse number {0}"), firstMissing);
        }
        else
        {
            anchor.MissingEndRef = new BCVRef(bookId, chapNumber, lastMissing);
            AddError(anchor, offset, 0, Localize("Missing verse numbers {0}-{1}"),
                firstMissing, lastMissing);
        }

        // Resume right after the reported range.
        verse = lastMissing + 1;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Records an error: formats the message, wraps the offending range of the token in
/// a <see cref="TextTokenSubstring"/> and passes it to the error-recording handler.
/// </summary>
/// <param name="token">The current token being processed.</param>
/// <param name="offset">Offset in the token where the offending text begins.</param>
/// <param name="length">The length of the offending text.</param>
/// <param name="message">The message (a format string).</param>
/// <param name="args">The arguments to format the message.</param>
/// ------------------------------------------------------------------------------------
private void AddError(ITextToken token, int offset, int length, string message,
    params object[] args)
{
    string formattedMsg;
    if (args == null)
    {
        formattedMsg = String.Format(message);
    }
    else
    {
        formattedMsg = string.Format(message, args);
    }

    TextTokenSubstring offendingText = new TextTokenSubstring(token, offset, length, formattedMsg);
    m_recordError(new RecordErrorEventArgs(offendingText, CheckId));
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Initializes a chapter token from a token whose text holds the chapter number.
/// The number is built from the digits of the text; the first non-digit character
/// marks the token invalid and sets the chapter number to -1. The token is also
/// marked invalid when its text does not match the given format.
/// </summary>
/// <param name="token">The token holding the chapter number text.</param>
/// <param name="chapterNumberFormat">The pattern the token text is matched against.</param>
/// ------------------------------------------------------------------------------------
internal ChapterToken(ITextToken token, Regex chapterNumberFormat)
{
    Token = token;
    m_chapNumber = 0;

    if (!chapterNumberFormat.IsMatch(Token.Text))
    {
        Valid = false;
    }

    foreach (char ch in token.Text)
    {
        if (!Char.IsDigit(ch))
        {
            Valid = false;
            m_chapNumber = -1;
            break;
        }

        // Append this digit to the number accumulated so far.
        m_chapNumber = (m_chapNumber * 10) + (int)Char.GetNumericValue(ch);
    }

    Implicit = false;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Gets the substring for the character starting at position iChar, including the
/// diacritics counted by GetLengthOfChar. When the token is a VerseTextToken, the
/// substring refers to the token it wraps.
/// </summary>
/// <param name="tok">The token</param>
/// <param name="iChar">The index of the character.</param>
/// <returns>A substring covering the character at <paramref name="iChar"/>.</returns>
/// ------------------------------------------------------------------------------------
private TextTokenSubstring GetSubstring(ITextToken tok, int iChar)
{
    int charLength = GetLengthOfChar(tok, iChar);

    VerseTextToken verseText = tok as VerseTextToken;
    ITextToken sourceToken = verseText != null ? verseText.Token : tok;

    return new TextTokenSubstring(sourceToken, iChar, charLength);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Records the character style of the token when it differs from the one currently
/// remembered, and resets the "character text found" flag in that case.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// ------------------------------------------------------------------------------------
private void RecordCharacterStyle(ITextToken tok)
{
    if (tok.CharStyleName == m_characterStyle)
    {
        return;
    }

    m_characterStyle = tok.CharStyleName;
    m_foundCharacterText = false;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Checks the list of found verses to see if any verses in the specified range have
/// already been found. Verse numbers outside the list (below 1 or beyond its last
/// index) are never considered found.
/// </summary>
/// <param name="curVerseStart">The first verse number of the range.</param>
/// <param name="curVerseEnd">The last verse number of the range (inclusive).</param>
/// <param name="versesFound">Tokens indexed by verse number; a non-null entry means the
/// verse was found. Index 0 is not inspected.</param>
/// <returns><c>true</c> if at least one verse in the range has a non-null entry;
/// otherwise <c>false</c></returns>
/// ------------------------------------------------------------------------------------
private bool VersesAlreadyFound(int curVerseStart, int curVerseEnd, ITextToken[] versesFound)
{
    // Clamp the range to the valid indices up front. This avoids stepping through
    // verse numbers that can never match, and keeps the loop counter from wrapping
    // around when curVerseEnd is int.MaxValue.
    int first = Math.Max(curVerseStart, 1);
    int last = Math.Min(curVerseEnd, versesFound.Length - 1);

    for (int verse = first; verse <= last; verse++)
    {
        if (versesFound[verse] != null)
        {
            return true;
        }
    }
    return false;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Splits the word into prefix and suffix via <c>AWord</c> and adds it to the results
/// unless it has neither, or its prefix/suffix is covered by one of the configured
/// lists of uncapitalized prefixes, capitalized suffixes or capitalized prefixes.
/// </summary>
/// <param name="tok">The token that contains the word.</param>
/// <param name="wap">The word, with its offset in the token.</param>
/// <param name="desiredKey">Passed through to AddWord to filter the results.</param>
/// ------------------------------------------------------------------------------------
public void ProcessWord(ITextToken tok, WordAndPunct wap, string desiredKey)
{
    AWord word = new AWord(wap.Word, m_categorizer);

    if (word.Prefix == string.Empty && word.Suffix == string.Empty)
    {
        return;
    }
    if (m_uncapitalizedPrefixes.Contains(word.Prefix))
    {
        return;
    }
    // "*x" entry: matches any prefix ending in the character x. A word can reach this
    // point with an empty prefix (when only its suffix is non-empty); there is no last
    // character to look up then, and indexing would throw.
    if (word.Prefix.Length > 0 &&
        m_uncapitalizedPrefixes.Contains("*" + word.Prefix[word.Prefix.Length - 1]))
    {
        return;
    }
    // "*" entry: suppresses every word that reaches this point.
    if (m_uncapitalizedPrefixes.Contains("*"))
    {
        return;
    }
    if (m_capitalizedSuffixes.Contains(word.Suffix))
    {
        return;
    }
    if (m_capitalizedPrefixes.Contains(word.Prefix))
    {
        return;
    }

    AddWord(tok, wap, desiredKey);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Processes the Scripture token: walks its text (with abbreviations blanked out) and
/// adds a substring to <paramref name="result"/> for each lowercase word-forming
/// character that triggers a paragraph-style, character-style or sentence-initial
/// capitalization error.
/// </summary>
/// <param name="tok">The token.</param>
/// <param name="result">The list that receives the capitalization errors.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, List<TextTokenSubstring> result)
{
    string tokenText = RemoveAbbreviations(tok);

    RecordParagraphStyle(tok);
    RecordCharacterStyle(tok);

    // must be at least one character in token to check the case of
    if (tok.Text == String.Empty)
    {
        return;
    }

    for (int i = 0; i < tokenText.Length; i++)
    {
        char current = tokenText[i];

        if (IsSentenceFinalPunctuation(current))
        {
            // A new sentence starts after this punctuation unless the very next
            // character is a digit. Punctuation at the end of the token counts
            // as a sentence end.
            int next = i + 1;
            m_fAtSentenceStart = next == tokenText.Length || !char.IsDigit(tokenText[next]);
            continue;
        }

        if (!m_categorizer.IsWordFormingCharacter(current))
        {
            continue;
        }

        if (m_categorizer.IsLower(current))
        {
            TextTokenSubstring firstLetter = GetSubstring(tok, i);

            // The checks run in this order and stop at the first one that reports.
            bool reported = CheckForParaCapitalizationError(tok, firstLetter, result);
            if (!reported)
            {
                reported = CheckForCharStyleCapilizationError(tok, firstLetter, result);
            }
            if (!reported && m_fAtSentenceStart)
            {
                firstLetter.Message = CapitalizationCheck.GetErrorMessage(m_checksDataSource,
                    StyleCapInfo.CapCheckTypes.SentenceInitial, string.Empty);
                result.Add(firstLetter);
            }
        }

        // A word-forming character has been seen, so the pending "first letter"
        // states are all consumed.
        m_fAtSentenceStart = false;
        m_foundCharacterText = true;
        m_foundParagraphText = true;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Builds a substring covering the word and appends it to the result list, unless a
/// non-empty <paramref name="desiredKey"/> is given that differs from the
/// substring's inventory text.
/// </summary>
/// <param name="tok">The text token that contains the word.</param>
/// <param name="wap">Supplies the word's offset in the token and its text.</param>
/// <param name="desiredKey">If null or empty, every word is recorded; otherwise only
/// words whose inventory text equals this key.</param>
/// ------------------------------------------------------------------------------------
private void AddWord(ITextToken tok, WordAndPunct wap, string desiredKey)
{
    TextTokenSubstring wordSubstring = new TextTokenSubstring(tok, wap.Offset, wap.Word.Length);

    bool filterActive = !String.IsNullOrEmpty(desiredKey);
    if (filterActive && desiredKey != wordSubstring.InventoryText)
    {
        return;
    }

    m_result.Add(wordSubstring);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Remembers the paragraph style when the token begins a paragraph and resets the
/// per-paragraph state. Tokens that do not start a paragraph are ignored.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// ------------------------------------------------------------------------------------
private void RecordParagraphStyle(ITextToken tok)
{
    if (!tok.IsParagraphStart)
    {
        return;
    }

    m_paragraphStyle = tok.ParaStyleName;
    m_foundParagraphText = false;

    // When paragraphs are processed separately, the sentence-start flag is cleared
    // at each new paragraph.
    if (m_processParagraphsSeparately)
    {
        m_fAtSentenceStart = false;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Scans the token for characters that belong to a punctuation pair and updates the
/// list of found pair tokens for each one.
/// </summary>
/// <param name="tok">The token to scan.</param>
/// <param name="desiredKey">Key forwarded to <c>FinalizeResult</c>.</param>
/// <param name="result">Result list forwarded to <c>FinalizeResult</c>.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, string desiredKey, List<TextTokenSubstring> result)
{
    // A paragraph start in a non-poetic style finalizes the pending results when any
    // found pairs are closed by a paragraph.
    bool finalizeAtParagraph = AnyFoundPairsClosedByPara && tok.IsParagraphStart &&
        !m_styleCategorizer.IsPoeticStyle(tok.ParaStyleName);
    if (finalizeAtParagraph)
    {
        FinalizeResult(desiredKey, result);
    }

    for (int pos = 0; pos < tok.Text.Length; pos++)
    {
        string current = tok.Text[pos].ToString();
        if (!m_pairList.BelongsToPair(current))
        {
            continue;
        }

        StoreFoundPairToken(tok, pos);
        RemoveMatchedPunctAtEndOfFirstWordInIntroOutline(tok, pos);
        RemoveIfMatchedPairFound();
        RecordOverlappingPairs();
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Returns the token's text with every occurrence of each known abbreviation
/// overwritten by the same number of spaces, so character positions are unchanged.
/// </summary>
/// <param name="tok">The Scripture token.</param>
/// <returns>The token text with abbreviations replaced by spaces.</returns>
/// ------------------------------------------------------------------------------------
private string RemoveAbbreviations(ITextToken tok)
{
    string original = tok.Text;
    string blanked = original;

    foreach (string abbreviation in m_abbreviations)
    {
        // Empty entries are skipped.
        if (abbreviation != "")
        {
            blanked = blanked.Replace(abbreviation, new string(' ', abbreviation.Length));
        }
    }

    Debug.Assert(original.Length == blanked.Length,
        "Length of text should not change",
        "Abbreviations are replaced by spaces, but the overall text length should stay the same.");
    return blanked;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Creates a one-character substring at the given position and appends it to the
/// list of found pair tokens.
/// </summary>
/// <param name="tok">The token that contains the character.</param>
/// <param name="i">The index of the character within the token's text.</param>
/// ------------------------------------------------------------------------------------
private void StoreFoundPairToken(ITextToken tok, int i)
{
    TextTokenSubstring pairToken = new TextTokenSubstring(tok, i, 1);

    // Start with a default message; it may be replaced later.
    string defaultMessage = m_checksDataSource.GetLocalizedString("Unmatched punctuation");
    pairToken.Message = defaultMessage;

    m_pairTokensFound.Add(pairToken);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Measures a character together with the run of diacritics that immediately
/// follows it.
/// </summary>
/// <param name="tok">The text token.</param>
/// <param name="iBaseCharacter">The index of the base character in the text token.</param>
/// <returns>1 for the base character plus the number of consecutive diacritics that
/// follow it.</returns>
/// ------------------------------------------------------------------------------------
private int GetLengthOfChar(ITextToken tok, int iBaseCharacter)
{
    string text = tok.Text;

    // Advance past every diacritic directly after the base character.
    int end = iBaseCharacter + 1;
    while (end < text.Length && m_categorizer.IsDiacritic(text[end]))
    {
        end++;
    }

    return end - iBaseCharacter;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// In an introduction-outline paragraph, drops the most recently found pair token
/// when it is a closing mark that is the last character of the token's first word.
/// </summary>
/// <param name="tok">The token being processed.</param>
/// <param name="i">The index in the token's text of the character just stored.</param>
/// ------------------------------------------------------------------------------------
private void RemoveMatchedPunctAtEndOfFirstWordInIntroOutline(ITextToken tok, int i)
{
    if (!m_styleCategorizer.IsIntroductionOutlineStyle(tok.ParaStyleName))
    {
        return;
    }

    // The first word is everything before the first whitespace character.
    string firstWord = tok.Text.Split()[0];
    bool endsFirstWord = (i + 1 == firstWord.Length);

    if (endsFirstWord)
    {
        int newest = m_pairTokensFound.Count - 1;

        // Only a closing mark is removed.
        if (m_pairList.IsClose(m_pairTokensFound[newest].Text))
        {
            m_pairTokensFound.RemoveAt(newest);
        }
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Compares the lowercased word with the previously seen word and records it through
/// <c>AddWord</c> when they are equal. Afterwards the word becomes the new previous
/// word, unless non-whitespace punctuation follows, in which case <c>Reset</c> is called.
/// </summary>
/// <param name="tok">The text token that contains the word.</param>
/// <param name="wap">The word and the punctuation associated with it.</param>
/// ------------------------------------------------------------------------------------
private void ProcessWord(ITextToken tok, WordAndPunct wap)
{
    if (wap.Word == "")
    {
        return;
    }

    string lowered = wap.Word.ToLower();
    bool repeatsPrevious = (prevWord == lowered);
    if (repeatsPrevious)
    {
        AddWord(tok, wap);
    }
    prevWord = lowered;

    // If there are characters (such as quotes) between words, the words are not
    // considered repeating even when they are identical.
    bool separatedByPunctuation = false;
    foreach (char punct in wap.Punct)
    {
        if (!char.IsWhiteSpace(punct))
        {
            separatedByPunctuation = true;
            break;
        }
    }

    if (separatedByPunctuation)
    {
        Reset();
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Extracts the punctuation sequences from this token by classifying each character
/// as punctuation, digit, whitespace, or anything else.
/// </summary>
/// <param name="tok">The token to scan.</param>
/// <param name="desiredKey">Key forwarded to <c>FinalizeResult</c>.</param>
/// <param name="result">Result list forwarded to <c>FinalizeResult</c>.</param>
/// ------------------------------------------------------------------------------------
public void ProcessToken(ITextToken tok, string desiredKey, List<TextTokenSubstring> result)
{
    bool atParagraphStart = tok.IsParagraphStart || m_fTreatAsParagraphStart;
    if (atParagraphStart)
    {
        ProcessWhitespaceOrParagraph(true);
        m_fTreatAsParagraphStart = false;
    }

    for (int pos = 0; pos < tok.Text.Length; ++pos)
    {
        char current = tok.Text[pos];

        if (m_categorizer.IsPunctuation(current))
        {
            ProcessPunctuation(tok, pos);
            continue;
        }

        if (char.IsDigit(current))
        {
            // If the previous finalize ended with a number and exactly one punctuation
            // mark has been collected since, this digit completes a sequence such as
            // 3:14, which is ignored.
            bool betweenNumbers = m_finalizedWithNumber && m_puncts.Count == 1 &&
                m_puncts[0].TokenType == PunctuationTokenType.punctuation;

            if (betweenNumbers)
            {
                m_puncts.Clear();
            }
            else
            {
                ProcessDigit(tok, pos);
                FinalizeResult(desiredKey, result, false);
            }
            continue;
        }

        if (char.IsWhiteSpace(current))
        {
            ProcessWhitespaceOrParagraph(false);
            continue;
        }

        // Not punctuation, digit, or whitespace: this must be the start of a new
        // word, so finalize any open punctuation sequence.
        FinalizeResult(desiredKey, result, false);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Appends another token to the tokens covered by this substring.
/// </summary>
/// <param name="token">The token to append; it must not start a paragraph.</param>
/// <exception cref="ArgumentException">The token starts a paragraph.</exception>
/// ------------------------------------------------------------------------------------
public void AddToken(ITextToken token)
{
    bool startsNewParagraph = token.IsParagraphStart;
    if (startsNewParagraph)
    {
        throw new ArgumentException("A substring must be wholly contained within a single paragraph.");
    }

    m_tokens.Add(token);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Adds the punctuation character at the given index to the list of punctuation
/// tokens, then re-labels a preceding whitespace token as a quote separator when it
/// sits between two punctuation marks that point in the same direction.
/// </summary>
/// <param name="tok">The text token</param>
/// <param name="i">The index of the punctuation character</param>
/// ------------------------------------------------------------------------------------
private void ProcessPunctuation(ITextToken tok, int i)
{
    TextTokenSubstring punctSubstring = new TextTokenSubstring(tok, i, 1);
    bool opensQuote = m_quotationCategorizer.IsInitialPunctuation(punctSubstring.Text);
    bool closesQuote = m_quotationCategorizer.IsFinalPunctuation(punctSubstring.Text);
    m_puncts.Add(new PunctuationToken(PunctuationTokenType.punctuation, punctSubstring,
        opensQuote, closesQuote));

    // Special case: a sequence like
    //   opening quotation punctuation / space / opening quotation punctuation
    // (for example U+201C LEFT DOUBLE QUOTATION MARK, U+0020 SPACE,
    // U+2018 LEFT SINGLE QUOTATION MARK) is treated as if the space were not there.
    // This lets a quotation mark count as word initial even when a space follows it.
    int count = m_puncts.Count;
    if (count < 3)
    {
        return;
    }

    // The last three tokens must be punctuation / whitespace / punctuation, and the
    // whitespace must not be a paragraph break.
    PunctuationToken middle = m_puncts[count - 2];
    if (middle.TokenType != PunctuationTokenType.whitespace || middle.IsParaBreak)
    {
        return;
    }

    PunctuationToken before = m_puncts[count - 3];
    if (before.TokenType != PunctuationTokenType.punctuation)
    {
        return;
    }

    // Both marks must point in the same direction (both initial or both final).
    PunctuationToken after = m_puncts[count - 1];
    bool sameDirection = (before.IsInitial && after.IsInitial) ||
        (before.IsFinal && after.IsFinal);
    if (sameDirection)
    {
        middle.TokenType = PunctuationTokenType.quoteSeparator;
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Adds a number token to the list of punctuation tokens. The token carries no
/// substring and is neither initial nor final. The parameters are not used by the
/// active code; they are only referenced from the disabled <c>UNUSED</c> region.
/// </summary>
/// <param name="tok">The text token that contains the digit.</param>
/// <param name="i">The index of the digit within the token's text.</param>
/// ------------------------------------------------------------------------------------
private void ProcessDigit(ITextToken tok, int i)
{
    PunctuationToken numberToken =
        new PunctuationToken(PunctuationTokenType.number, null, false, false);
    m_puncts.Add(numberToken);

#if UNUSED
    // special case: treat a sequence like
    // number/punctuation/number
    // as if the punctuation were not there. an example of this would be 1:2
    // this allows the : in 1:2 not to be counted as punctuation
    if (tokens.Count >= 3)
    {
        // If the last three tokens are number/select punctuation/number
        if (tokens[tokens.Count - 3].TokenType == PunctuationTokenType.number)
        {
            string separator = tokens[tokens.Count - 2].ToString();
            //! make the list of separator characters configurable
            if (separator == "," || separator == "." || separator == "-" || separator == ":")
            {
                tokens.RemoveAt(tokens.Count - 2);
                // The offset (-2) stays the same as the line of code above
                // since after the previous line is executed some of the tokens shift position.
                tokens.RemoveAt(tokens.Count - 2);
            }
        }
    }
#endif
}