Пример #1
0
        private static bool IsVersePara(UsfmToken paraToken)
        {
            string style = paraToken.Marker.Marker;

            if (NonVerseParaStyles.Contains(style))
            {
                return(false);
            }

            if (IsNumberedStyle("ms", style))
            {
                return(false);
            }

            if (IsNumberedStyle("s", style))
            {
                return(false);
            }

            return(true);
        }
Пример #2
0
        public IEnumerable <UsfmToken> Parse(string usfm, bool preserveWhitespace = false)
        {
            List <UsfmToken> tokens = new List <UsfmToken>();

            int index = 0;                  // Current position

            while (index < usfm.Length)
            {
                int nextMarkerIndex = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
                if (nextMarkerIndex == -1)
                {
                    nextMarkerIndex = usfm.Length;
                }

                // If text, create text token until end or next \
                var ch = usfm[index];
                if (ch != '\\')
                {
                    string text = usfm.Substring(index, nextMarkerIndex - index);
                    if (!preserveWhitespace)
                    {
                        text = RegularizeSpaces(text);
                    }

                    tokens.Add(new UsfmToken(UsfmTokenType.Text, null, text));

                    index = nextMarkerIndex;
                    continue;
                }

                // Get marker (and move past whitespace or star ending)
                index++;
                int markerStart = index;
                while (index < usfm.Length)
                {
                    ch = usfm[index];

                    // Backslash starts a new marker
                    if (ch == '\\')
                    {
                        break;
                    }

                    // End star is part of marker
                    if (ch == '*')
                    {
                        index++;
                        break;
                    }

                    if (IsNonSemanticWhiteSpace(ch))
                    {
                        // Preserve whitespace if needed, otherwise skip
                        if (!preserveWhitespace)
                        {
                            index++;
                        }
                        break;
                    }
                    index++;
                }
                string markerStr = usfm.Substring(markerStart, index - markerStart).TrimEnd();

                // Multiple whitespace after non-end marker is ok
                if (!markerStr.EndsWith("*", StringComparison.Ordinal) && !preserveWhitespace)
                {
                    while ((index < usfm.Length) && IsNonSemanticWhiteSpace(usfm[index]))
                    {
                        index++;
                    }
                }

                // Lookup marker
                UsfmMarker marker = _stylesheet.GetMarker(markerStr.TrimStart('+'));

                // If starts with a plus and is not a character style, it is an unknown marker
                if (markerStr.StartsWith("+", StringComparison.Ordinal) && marker.StyleType != UsfmStyleType.Character)
                {
                    marker = _stylesheet.GetMarker(markerStr);
                }

                switch (marker.StyleType)
                {
                case UsfmStyleType.Character:
                    // Handle verse special case
                    if ((marker.TextProperties & UsfmTextProperties.Verse) > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Verse, marker, GetNextWord(usfm, ref index, preserveWhitespace)));
                    }
                    else
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Character, marker, null));
                    }
                    break;

                case UsfmStyleType.Paragraph:
                    // Handle chapter special case
                    if ((marker.TextProperties & UsfmTextProperties.Chapter) > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Chapter, marker, GetNextWord(usfm, ref index, preserveWhitespace)));
                    }
                    else if ((marker.TextProperties & UsfmTextProperties.Book) > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Book, marker, GetNextWord(usfm, ref index, preserveWhitespace)));
                    }
                    else
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null));
                    }
                    break;

                case UsfmStyleType.Note:
                    tokens.Add(new UsfmToken(UsfmTokenType.Note, marker, GetNextWord(usfm, ref index, preserveWhitespace)));
                    break;

                case UsfmStyleType.End:
                    tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null));
                    break;

                case UsfmStyleType.Unknown:
                    // End tokens are always end tokens, even if unknown
                    if (markerStr.EndsWith("*", StringComparison.Ordinal))
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null));
                    }
                    else
                    {
                        // Handle special case of esb and esbe which might not be in basic stylesheet
                        // but are always sidebars and so should be tokenized as paragraphs
                        if (markerStr == "esb" || markerStr == "esbe")
                        {
                            tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null));
                            break;
                        }
                        // Create unknown token with a corresponding end note
                        tokens.Add(new UsfmToken(UsfmTokenType.Unknown, marker, null));
                    }
                    break;
                }
            }

            // Forces a space to be present in tokenization if immediately
            // before a token requiring a preceeding CR/LF. This is to ensure
            // that when written to disk and re-read, that tokenization
            // will match. For example, "\p test\p here" requires a space
            // after "test". Also, "\p \em test\em*\p here" requires a space
            // token inserted after \em*
            if (!preserveWhitespace)
            {
                for (int i = 1; i < tokens.Count; i++)
                {
                    // If requires newline (verses do, except when after '(' or '[')
                    if (tokens[i].Type == UsfmTokenType.Book ||
                        tokens[i].Type == UsfmTokenType.Chapter ||
                        tokens[i].Type == UsfmTokenType.Paragraph ||
                        (tokens[i].Type == UsfmTokenType.Verse &&
                         !(tokens[i - 1].Type == UsfmTokenType.Text &&
                           (tokens[i - 1].Text.EndsWith("(", StringComparison.Ordinal) || tokens[i - 1].Text.EndsWith("[", StringComparison.Ordinal)))))
                    {
                        // Add space to text token
                        if (tokens[i - 1].Type == UsfmTokenType.Text)
                        {
                            if (!tokens[i - 1].Text.EndsWith(" ", StringComparison.Ordinal))
                            {
                                tokens[i - 1] = new UsfmToken(tokens[i - 1].Text + " ");
                            }
                        }
                        else if (tokens[i - 1].Type == UsfmTokenType.End)
                        {
                            // Insert space token after * of end marker
                            tokens.Insert(i, new UsfmToken(UsfmTokenType.Text, null, " "));
                            i++;
                        }
                    }
                }
            }

            return(tokens);
        }