This class tokenizes USFM text snippets without using any Paratext-specific code. Markers supported: id, rem, s, p, q1, q2, io, f, f*, f?, x, x*, x?, nd, nd*
Inheritance: ITextToken
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Creates a new token and copies onto it this token's paragraph and character
        /// style names, book text, offset, length, chapter, verse and the three
        /// publishable/note/verse text flags.
        /// </summary>
        /// <returns>The newly created copy.</returns>
        /// ------------------------------------------------------------------------------------
        public ITextToken Clone()
        {
            // Members are assigned in the same order as they are listed here.
            UnitTestUSFMTextToken copy = new UnitTestUSFMTextToken
            {
                ParaStyleName = this.ParaStyleName,
                CharStyleName = this.CharStyleName,
                BookText = this.BookText,
                Offset = this.Offset,
                Length = this.Length,
                Chapter = this.Chapter,
                Verse = this.Verse,
                IsPublishableText = this.IsPublishableText,
                IsNoteText = this.IsNoteText,
                IsVerseText = this.IsVerseText
            };

            return copy;
        }
        // Builds the token for the marker whose backslash sits at position ind.
        // The new token receives its Offset (first character after the marker),
        // its BookText (the whole text) and either its paragraph or its character
        // style name, depending on what IsParagraphStart says about the marker.
        private UnitTestUSFMTextToken NextToken(string text, int ind)
        {
            // The marker name begins right after the backslash. When scanning ends,
            // pos is the first character that does not belong to the marker.
            // The space in '\\p ' terminates the marker and is consumed with it;
            // the space in '\\nd* ' is not consumed because '*' already ended the marker.
            int    markerStart = ind + 1;
            int    pos         = markerStart;
            string marker      = "";

            while (pos < text.Length)
            {
                char ch = text[pos];

                if (ch <= ' ')
                {
                    // Whitespace/control character: the name stops before it.
                    marker = text.Substring(markerStart, pos - markerStart);
                    pos++;

                    // A line feed directly after the terminator is swallowed as well.
                    if (pos < text.Length && text[pos] == '\n')
                    {
                        pos++;
                    }
                    break;
                }

                if (ch == '*')
                {
                    // End marker: the asterisk is part of the name.
                    pos++;
                    marker = text.Substring(markerStart, pos - markerStart);
                    break;
                }

                pos++;
            }
            // NOTE(review): if the text ends before a terminator is seen, marker stays
            // empty and pos equals text.Length - confirm that this is intended.

            UnitTestUSFMTextToken result = new UnitTestUSFMTextToken
            {
                Offset   = pos,
                BookText = text
            };

            if (IsParagraphStart(marker))
            {
                result.ParaStyleName = marker;
            }
            else
            {
                result.CharStyleName = marker;
            }

            return result;
        }
        /// <summary>
        /// Divide text for book into TextTokens.
        /// Offset and BookText are set by NextToken; Length, Chapter and Verse are
        /// filled in here.
        /// Tricky things needing done:
        /// 1) Split \v N abc... into two tokens, first containing just verse number
        /// 2) \f X abc... don't return caller as part of the token
        /// A verse number that is the last thing in the text is handled without
        /// reading past the end of the string.
        /// </summary>
        /// <param name="text">The text to split at its backslash markers.</param>
        /// <returns>
        /// The marker tokens encountered while inside a publishable paragraph, plus
        /// one verse-text token for every verse marker.
        /// </returns>
        private List <UnitTestUSFMTextToken> DivideText(string text)
        {
            UnitTestUSFMTextToken        tok    = null;
            List <UnitTestUSFMTextToken> tokens = new List <UnitTestUSFMTextToken>();
            string chapter       = "1";
            string verse         = "0";
            bool   inPublishable = false;

            for (int i = 0; i < text.Length;)
            {
                // Ordinal search for the next marker; the char overload avoids the
                // culture-sensitive comparison done by the string overload.
                int ind = text.IndexOf('\\', i);
                if (tok != null)                  // if token in progress, set its length
                {
                    int last = (ind == -1) ? text.Length : ind;
                    tok.Length = last - tok.Offset;
                }

                if (ind == -1)
                {
                    break;                             // quit if no more markers
                }
                tok = NextToken(text, ind);            // start new token

                if (tok.IsParagraphStart)
                {
                    // sadly \c does not have publishable property set in usfm.sty,
                    // so a chapter marker is treated as publishable explicitly.
                    inPublishable = tok.IsPublishable ||
                                    tok.IsChapter;
                }

                if (inPublishable)
                {
                    tokens.Add(tok);
                }

                if (tok.IsChapter)
                {
                    chapter = GetCVNumber(text, tok.Offset);
                    // Everything after \c is verse '0'.
                    // This allows the title of Psalms (\d) which are present in the Hebrew
                    // text to be considered verse text.
                    verse = "0";
                }
                else if (tok.IsVerse)
                {
                    // The current token keeps just the verse number.
                    // NOTE(review): this verse-number token is not assigned Chapter/Verse
                    // below (only its clone is) - confirm that this is intended.
                    verse      = GetCVNumber(text, tok.Offset);
                    tok.Length = verse.Length;

                    // Make another token to contain the verse text
                    tok = (UnitTestUSFMTextToken)tok.Clone();
                    tok.CharStyleName = "";
                    tok.Offset       += verse.Length;
                    // The clone inherited the verse number's length. Reset it so the
                    // token is not left with a stale length when the text ends here;
                    // otherwise the next loop iteration computes the real length.
                    tok.Length        = 0;
                    tokens.Add(tok);

                    // If number followed by a space, skip this. The bounds check covers
                    // a verse number that is the very last thing in the text.
                    if (tok.Offset < text.Length && char.IsWhiteSpace(text[tok.Offset]))
                    {
                        tok.Offset += 1;
                    }
                }

                tok.Chapter = chapter;
                tok.Verse   = verse;

                if (tok.IsNoteStart)
                {
                    // Skip over the footnote caller: advance up to and including the
                    // first whitespace character, but stop in front of a backslash.
                    while (tok.Offset < text.Length)
                    {
                        char cc = text[tok.Offset];
                        if (cc == '\\')
                        {
                            break;
                        }

                        ++tok.Offset;
                        if (char.IsWhiteSpace(cc))
                        {
                            break;
                        }
                    }
                }

                i = tok.Offset;
            }

            return tokens;
        }
示例#4
0
		// Creates the token that belongs to the marker starting at the backslash at
		// index ind, and sets its Offset, BookText and paragraph or character style name.
		private UnitTestUSFMTextToken NextToken(string text, int ind)
		{
			// 'end' is advanced past every character that is consumed. Once the loop
			// finishes it is the index of the first character after the marker.
			// A whitespace terminator (as in '\\p ') is consumed together with the
			// marker but is not part of its name; for '\\nd* ' the asterisk closes
			// the marker, so the following space is left alone.
			int nameStart = ind + 1;
			int end = nameStart;
			string marker = "";
			bool terminated = false;

			while (!terminated && end < text.Length)
			{
				char c = text[end++];

				if (c == '*')
				{
					// The asterisk belongs to the marker name.
					marker = text.Substring(nameStart, end - nameStart);
					terminated = true;
				}
				else if (c <= 32)
				{
					// The terminator itself (already consumed) is excluded from the name.
					marker = text.Substring(nameStart, end - 1 - nameStart);

					// Also consume a line feed that directly follows the terminator.
					if (end < text.Length && text[end] == '\n')
					{
						end++;
					}
					terminated = true;
				}
			}

			UnitTestUSFMTextToken token = new UnitTestUSFMTextToken();
			token.Offset = end;
			token.BookText = text;

			bool startsParagraph = IsParagraphStart(marker);
			if (startsParagraph)
			{
				token.ParaStyleName = marker;
			}
			else
			{
				token.CharStyleName = marker;
			}

			return token;
		}
示例#5
0
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Returns a new token that carries the same style names, book text, offset,
		/// length, chapter, verse and text flags as this one.
		/// </summary>
		/// <returns>The copy, as a new <c>UnitTestUSFMTextToken</c>.</returns>
		/// ------------------------------------------------------------------------------------
		public ITextToken Clone()
		{
			return new UnitTestUSFMTextToken
			{
				// Styles and the text the token refers to
				ParaStyleName = ParaStyleName,
				CharStyleName = CharStyleName,
				BookText = BookText,
				// Position inside the book text
				Offset = Offset,
				Length = Length,
				// Scripture reference
				Chapter = Chapter,
				Verse = Verse,
				// Text-kind flags
				IsPublishableText = IsPublishableText,
				IsNoteText = IsNoteText,
				IsVerseText = IsVerseText
			};
		}