Example #1
0
        /// <summary>
        /// Tokenizes <c>bookText</c> with <c>stylesheet</c> and returns the resulting tokens.
        /// </summary>
        /// <param name="bookNum">Book number; this implementation does not consult it.</param>
        /// <returns>The token list produced by <c>UsfmToken.Tokenize</c>.</returns>
        public List <UsfmToken> GetUsfmTokens(int bookNum)
        {
            return UsfmToken.Tokenize(stylesheet, bookText, false);
        }
Example #2
0
            /// <summary>
            /// Compares the number of ruby glosses supplied in the gloss attribute with the number
            /// of character sequences in the accumulated inner text, and records a marker error
            /// when the two counts disagree.
            /// </summary>
            /// <param name="state">Parser state passed through to the error recorder.</param>
            /// <param name="marker">Marker the error is recorded against.</param>
            /// <param name="attributes">Attributes of the marker; may be null.</param>
            private void CheckRubyGlossing(UsfmParserState state, string marker, NamedAttribute[] attributes)
            {
                var innerText = innerTextBuilder.Value?.ToString();
                if (string.IsNullOrEmpty(innerText))
                {
                    // Nothing to gloss
                    return;
                }

                int baseCount = CharacterSequences(innerText).ToArray().Length;

                var glossAttribute = attributes?.FirstOrDefault(attr => attr.Name == AttributeName.Gloss);
                var glossValue     = glossAttribute?.Value;
                if (string.IsNullOrEmpty(glossValue))
                {
                    // An empty gloss is reported elsewhere as a missing gloss attribute, so do not
                    // report a second error for the same problem here.
                    return;
                }

                int glossCount = UsfmToken.ParseRubyGlosses(glossValue, false).Length;

                if (baseCount < glossCount)
                {
                    RecordMarkerError(state, marker,
                                      Localizer.Str(@"More ruby glosses than base text characters") + markerSlot);
                    return;
                }

                // A single gloss is accepted regardless of how long the base text is
                if (baseCount > glossCount && glossCount != 1)
                {
                    RecordMarkerError(state, marker,
                                      Localizer.Str(@"Fewer ruby glosses than base text characters") + markerSlot);
                }
            }
Example #3
0
        /// <summary>
        /// Runs <paramref name="lookaheadParser"/> forward to find out whether the element opened
        /// by <paramref name="marker"/> gets closed by its own end marker.
        /// </summary>
        /// <param name="state">State whose stack is the reference point for the comparison.</param>
        /// <param name="lookaheadParser">Parser that is advanced token by token.</param>
        /// <param name="marker">Marker of the element being checked.</param>
        /// <param name="isTokenClosed">
        /// True only when the token that changes the start of the stack is an end token whose
        /// marker is <paramref name="marker"/> followed by "*"; false when the marker is reopened
        /// at the same stack, when something else changes the stack, or when tokens run out.
        /// </param>
        private static void LookaheadParser(UsfmParserState state, UsfmParser lookaheadParser, string marker, out bool isTokenClosed)
        {
            // BEWARE: This method is fairly performance-critical
            string closingMarker = marker + "*";

            // Keep consuming tokens until the lookahead stack no longer starts with the reference
            // stack, the same marker is opened again, or there are no more tokens
            while (lookaheadParser.ProcessToken())
            {
                UsfmToken lookedAt = lookaheadParser.tokens[lookaheadParser.index];

                // Same marker opened again with an identical stack: it was never closed
                if (lookedAt.Marker == marker &&
                    lookaheadParser.State.Stack.SequenceEqual(state.Stack))
                {
                    isTokenClosed = false;
                    return;
                }

                // While the element is still open, the start of the lookahead stack is unchanged
                if (!lookaheadParser.State.Stack.Take(state.Stack.Count).SequenceEqual(state.Stack))
                {
                    // The stack changed: closed properly only if this token is the matching end marker
                    isTokenClosed = lookedAt.Marker == closingMarker && lookedAt.Type == UsfmTokenType.End;
                    return;
                }
            }

            // Ran out of tokens without seeing a close
            isTokenClosed = false;
        }
Example #4
0
 /// <summary>
 /// Determines whether the token at the current index is a "ref" marker that is followed by a
 /// text token containing "|" and then by its own end marker.
 /// </summary>
 /// <param name="token">Token to test; its Marker and EndMarker are compared.</param>
 /// <returns>true when all of the above conditions hold, otherwise false</returns>
 bool IsRef(UsfmToken token)
 {
     // Two more tokens are required after the current one
     if (index >= tokens.Count - 2)
     {
         return(false);
     }

     // The next token must carry text with a "|" separator
     string followingText = tokens[index + 1].Text;
     if (followingText == null || !followingText.Contains("|"))
     {
         return(false);
     }

     // The token after that must be the end marker for this token
     UsfmToken closer = tokens[index + 2];
     if (closer.Type != UsfmTokenType.End || closer.Marker != token.EndMarker)
     {
         return(false);
     }

     return(token.Marker == "ref");
 }
Example #5
0
            /// <summary>
            /// Checks a start or end milestone: reports milestones when USFM 3 is not allowed,
            /// tracks open start milestones, verifies that every end milestone has a matching
            /// open start with the same id attribute, and finally validates the attributes.
            /// </summary>
            /// <param name="state">Current parser state (supplies verse reference and offset).</param>
            /// <param name="marker">Milestone marker, without the leading backslash.</param>
            /// <param name="startMilestone">true for a start milestone, false for an end milestone.</param>
            /// <param name="namedAttributes">Attributes on the milestone; may be null.</param>
            public override void Milestone(UsfmParserState state, string marker, bool startMilestone, NamedAttribute[] namedAttributes)
            {
                if (!markerCheck.allowVersion3Usfm)
                {
                    recordError(new VerseRef(state.VerseRef), "\\" + marker, state.VerseOffset,
                                GetErrorMessage(unsupportedMarkerMessage, marker));
                }

                Tuple <VerseRef, int, string> tuple;

                if (startMilestone)
                {
                    // A second start for the same marker means the earlier one was never ended
                    if (openMilestones.TryGetValue(marker, out tuple))
                    {
                        recordError(tuple.Item1, marker, tuple.Item2, "#" + missingMilestoneEnd + " \\" + marker);
                    }

                    // Remember where this milestone started and which id it carries
                    openMilestones[marker] = new Tuple <VerseRef, int, string>(state.VerseRef.Clone(), state.VerseOffset, UsfmToken.GetAttribute(namedAttributes, AttributeName.Id));
                }
                else
                {
                    // Lazily build the end-marker -> start-marker lookup from the stylesheet
                    if (endMilestoneMarkerMap.Count == 0)
                    {
                        foreach (var tag in scrStylesheet.Tags.Where(t => t.StyleType == ScrStyleType.scMilestone))
                        {
                            endMilestoneMarkerMap[tag.Endmarker] = tag.Marker;
                        }
                    }

                    // Use TryGetValue rather than the indexer: an end marker with no stylesheet
                    // entry would otherwise throw KeyNotFoundException. Such a marker cannot
                    // have an open start, so it is reported as an unmatched end milestone.
                    string startMarker;
                    if (endMilestoneMarkerMap.TryGetValue(marker, out startMarker) &&
                        openMilestones.TryGetValue(startMarker, out tuple))
                    {
                        if (tuple.Item3 != UsfmToken.GetAttribute(namedAttributes, AttributeName.Id))
                        {
                            recordError(tuple.Item1, marker, tuple.Item2, "#" + Localizer.Str("Id on start/end milestones do not match:") + " \\" + startMarker);
                        }
                        openMilestones.Remove(startMarker);
                    }
                    else
                    {
                        recordError(state.VerseRef, marker, state.VerseOffset, "#" + Localizer.Str("End milestone has no matching start:") + " \\" + marker);
                    }
                }

                ValidateAttributes(state, scrStylesheet.GetTag(marker), marker, namedAttributes ?? new NamedAttribute[0]);
            }
Example #6
0
 /// <summary>
 /// Determines whether a token is a table cell: a character token whose marker starts with
 /// "th" or "tc" while a row element is on the parser stack.
 /// </summary>
 /// <param name="token">Token to test.</param>
 /// <returns>true if the token is treated as a table cell, otherwise false</returns>
 bool IsCell(UsfmToken token)
 {
     if (token.Type != UsfmTokenType.Character)
     {
         return(false);
     }

     string cellMarker   = token.Marker;
     bool   hasCellPrefix = cellMarker.StartsWith("th") || cellMarker.StartsWith("tc");

     // Cell markers only count as cells while a row is open
     return(hasCellPrefix && State.Stack.Exists(entry => entry.Type == UsfmElementTypes.Row));
 }
Example #7
0
        /// <summary>
        /// Processes a single token: advances to the next token, updates the verse offset,
        /// closes any stack elements the token implicitly ends, then updates the parser state
        /// for the token itself and notifies the sink (when one is set).
        /// </summary>
        /// <returns>
        /// false if there were no more tokens to process; true otherwise, including when the
        /// token was one of the tokens scheduled to be skipped
        /// </returns>
        public bool ProcessToken()
        {
            // If already on the last token (or the list is empty), there is nothing left to process
            if (index >= tokens.Count - 1)
            {
                return(false);
            }

            // Move to next token
            index++;

            // Update verse offset with previous token (since verse offset is from start of current token)
            if (index > 0)
            {
                State.VerseOffset += tokens[index - 1].GetLength(false, !tokensPreserveWhitespace);
            }

            // Skip over tokens that are to be skipped, ensuring that
            // SpecialToken state is true.
            // (skip is increased below whenever a token consumes the tokens that follow it.)
            if (skip > 0)
            {
                skip--;
                State.SpecialToken = true;
                return(true);
            }

            // Reset special token status
            State.SpecialToken = false;

            UsfmToken token = tokens[index];

            // Switch unknown types to either character or paragraph
            UsfmTokenType tokenType = token.Type;

            if (tokenType == UsfmTokenType.Unknown)
            {
                tokenType = DetermineUnknownTokenType();
            }

            // Report every token that carries a marker to the sink before any element is closed
            if (sink != null && !string.IsNullOrEmpty(token.Marker))
            {
                sink.GotMarker(State, token.Marker);
            }

            // First pass: close open elements that this token implicitly ends
            switch (tokenType)
            {
            case UsfmTokenType.Book:
            case UsfmTokenType.Chapter:
                CloseAll();
                break;

            case UsfmTokenType.Paragraph:
                // Handle special case of table rows
                if (token.Marker == "tr")
                {
                    // Close all but table and sidebar
                    while (State.Stack.Count > 0 &&
                           Peek().Type != UsfmElementTypes.Table &&
                           Peek().Type != UsfmElementTypes.Sidebar)
                    {
                        CloseElement();
                    }
                    break;
                }

                // Handle special case of sidebars
                if (token.Marker == "esb")
                {
                    // Close all
                    CloseAll();
                    break;
                }

                // Close all but sidebar
                while (State.Stack.Count > 0 && Peek().Type != UsfmElementTypes.Sidebar)
                {
                    CloseElement();
                }
                break;

            case UsfmTokenType.Character:
                // Handle special case of table cell
                if (IsCell(token))
                {
                    // Close until row (IsCell only returns true when a row is on the stack)
                    while (Peek().Type != UsfmElementTypes.Row)
                    {
                        CloseElement();
                    }
                    break;
                }

                // Handle refs
                if (IsRef(token))
                {
                    // Refs don't close anything
                    break;
                }

                // If non-nested character style, close all character styles
                if (!token.Marker.StartsWith("+"))
                {
                    CloseCharStyles();
                }
                break;

            case UsfmTokenType.Verse:
                CloseNote();
                break;

            case UsfmTokenType.Note:
                CloseNote();
                break;

            case UsfmTokenType.End:
                // If end marker for an active note
                if (State.Stack.Exists(e => e.Type == UsfmElementTypes.Note && (e.Marker + "*" == token.Marker)))
                {
                    CloseNote();
                    break;
                }

                // If end marker for a character style on stack, close it
                // If no matching end marker, close all character styles on top of stack
                UsfmParserElement elem;
                bool unmatched = true;
                while (State.Stack.Count > 0)
                {
                    elem = Peek();
                    if (elem.Type != UsfmElementTypes.Char)
                    {
                        break;
                    }
                    CloseElement();

                    // Determine if a + prefix is needed to close it (was nested char style)
                    bool plusPrefix = (State.Stack.Count > 0 && Peek().Type == UsfmElementTypes.Char);

                    // If is a match
                    if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker)
                    {
                        unmatched = false;
                        break;
                    }
                }

                // Unmatched end marker
                if (unmatched)
                {
                    if (sink != null)
                    {
                        sink.Unmatched(State, token.Marker);
                    }
                }
                break;
            }

            // Second pass: update the state for the token itself and notify the sink
            switch (tokenType)
            {
            case UsfmTokenType.Book:
                State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Book, token.Marker));

                // Code is always upper case
                string code = token.Data[0].ToUpperInvariant();

                // Update verse ref. Leave book alone if not empty to prevent parsing errors
                // on books with bad id lines.
                if (State.VerseRef.Book == "" && Canon.BookIdToNumber(code) != 0)
                {
                    State.VerseRef.Book = code;
                }
                State.VerseRef.ChapterNum = 1;
                State.VerseRef.VerseNum   = 0;
                State.VerseOffset         = 0;

                // Book start.
                if (sink != null)
                {
                    sink.StartBook(State, token.Marker, code);
                }
                break;

            case UsfmTokenType.Chapter:
                // Get alternate chapter number
                string altChapter = null;
                string pubChapter = null;
                if (!InventoryMode)
                {
                    // Alternate chapter is the sequence: \ca, text, \ca*
                    if (index < tokens.Count - 3 &&
                        tokens[index + 1].Marker == "ca" &&
                        tokens[index + 2].Text != null &&
                        tokens[index + 3].Marker == "ca*")
                    {
                        altChapter = tokens[index + 2].Text.Trim();
                        // Skip the three tokens just consumed
                        skip      += 3;

                        // Skip blank space after if present
                        if (index + skip < tokens.Count - 1 &&
                            tokens[index + skip + 1].Text != null &&
                            tokens[index + skip + 1].Text.Trim().Length == 0)
                        {
                            skip++;
                        }
                    }

                    // Get publishable chapter number (\cp followed by text; no end marker is checked).
                    // Offsets are relative to index + skip so this also works after an alternate chapter.
                    if (index + skip < tokens.Count - 2 &&
                        tokens[index + skip + 1].Marker == "cp" &&
                        tokens[index + skip + 2].Text != null)
                    {
                        pubChapter = tokens[index + skip + 2].Text.Trim();
                        skip      += 2;
                    }
                }

                // Chapter
                State.VerseRef.Chapter  = token.Data[0];
                State.VerseRef.VerseNum = 0;
                // Verse offset is not zeroed for chapter 1, as it is part of intro
                if (State.VerseRef.ChapterNum != 1)
                {
                    State.VerseOffset = 0;
                }

                if (sink != null)
                {
                    sink.Chapter(State, token.Data[0], token.Marker, altChapter, pubChapter);
                }
                break;

            case UsfmTokenType.Verse:
                string pubVerse = null;
                string altVerse = null;

                if (!InventoryMode)
                {
                    // Alternate verse is the sequence: \va, text, \va*
                    if (index < tokens.Count - 3 &&
                        tokens[index + 1].Marker == "va" &&
                        tokens[index + 2].Text != null &&
                        tokens[index + 3].Marker == "va*")
                    {
                        // Get alternate verse number
                        altVerse = tokens[index + 2].Text.Trim();
                        skip    += 3;
                    }
                    // Publishable verse is the sequence: \vp, text, \vp* (after any alternate verse)
                    if (index + skip < tokens.Count - 3 &&
                        tokens[index + skip + 1].Marker == "vp" &&
                        tokens[index + skip + 2].Text != null &&
                        tokens[index + skip + 3].Marker == "vp*")
                    {
                        // Get publishable verse number
                        pubVerse = tokens[index + skip + 2].Text.Trim();
                        skip    += 3;
                    }
                }

                // Verse
                State.VerseRef.Verse = token.Data[0];
                State.VerseOffset    = 0;

                if (sink != null)
                {
                    sink.Verse(State, token.Data[0], token.Marker, altVerse, pubVerse);
                }
                break;

            case UsfmTokenType.Paragraph:
                // Handle special case of table rows
                if (token.Marker == "tr")
                {
                    // Start table if not open
                    if (State.Stack.TrueForAll(e => e.Type != UsfmElementTypes.Table))
                    {
                        State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Table, null));
                        if (sink != null)
                        {
                            sink.StartTable(State);
                        }
                    }

                    State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Row, token.Marker));

                    // Row start
                    if (sink != null)
                    {
                        sink.StartRow(State, token.Marker);
                    }
                    break;
                }

                // Handle special case of sidebars
                if (token.Marker == "esb")
                {
                    bool isClosed = IsStudyBibleItemClosed("esb", "esbe");

                    // TODO - see FB 23934
                    // Would like to only add start sidebar if it is closed - adding unclosed marker will cause
                    // an end marker to be created in an unexpected place in the editor.
                    // if (isClosed)
                    State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Sidebar, token.Marker));

                    // Look for category (the sequence: \cat, text, \cat*)
                    string sidebarCategory = null;
                    if (index < tokens.Count - 3 &&
                        tokens[index + 1].Marker == "cat" &&
                        tokens[index + 2].Text != null &&
                        tokens[index + 3].Marker == "cat*")
                    {
                        // Get category
                        sidebarCategory = tokens[index + 2].Text.Trim();
                        skip           += 3;
                    }

                    if (sink != null)
                    {
                        sink.StartSidebar(State, token.Marker, sidebarCategory, isClosed);
                    }
                    break;
                }

                // Close sidebar if in sidebar; otherwise report the end marker as unmatched
                if (token.Marker == "esbe")
                {
                    if (State.Stack.Exists(e => e.Type == UsfmElementTypes.Sidebar))
                    {
                        CloseAll();
                    }
                    else if (sink != null)
                    {
                        sink.Unmatched(State, token.Marker);
                    }
                    break;
                }

                State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Para, token.Marker));

                // Paragraph opening (last argument flags a token whose original type was Unknown)
                if (sink != null)
                {
                    sink.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown);
                }
                break;

            case UsfmTokenType.Character:
                // Handle special case of table cells (treated as special character style)
                if (IsCell(token))
                {
                    // Alignment comes from the third character of the marker:
                    // 'c' gives "center", 'r' gives "end", anything else gives "start"
                    string align = "start";
                    if (token.Marker.Length > 2 && token.Marker[2] == 'c')
                    {
                        align = "center";
                    }
                    else if (token.Marker.Length > 2 && token.Marker[2] == 'r')
                    {
                        align = "end";
                    }

                    State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Cell, token.Marker));

                    if (sink != null)
                    {
                        sink.StartCell(State, token.Marker, align);
                    }
                    break;
                }

                if (IsRef(token))
                {
                    // xrefs are special tokens (they do not stand alone)
                    State.SpecialToken = true;

                    string display;
                    string target;
                    ParseDisplayAndTarget(out display, out target);

                    // Skip the text token and the end marker that IsRef verified
                    skip += 2;

                    if (sink != null)
                    {
                        sink.Ref(State, token.Marker, display, target);
                    }
                    break;
                }

                string actualMarker;
                bool   invalidMarker = false;
                if (token.Marker.StartsWith("+"))
                {
                    // Only strip + if properly nested
                    // NOTE(review): this reads lowercase `state` while most of the method uses
                    // `State` - presumably the same object; confirm.
                    actualMarker = state.CharTag != null?token.Marker.TrimStart('+') : token.Marker;

                    // A + marker with no enclosing character style is flagged as invalid
                    invalidMarker = state.CharTag == null;
                }
                else
                {
                    actualMarker = token.Marker;
                }

                State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Char, actualMarker, tokens[index].Attributes));
                // Whether the style is closed is only computed when a sink is attached
                if (sink != null)
                {
                    bool charIsClosed           = IsTokenClosed();
                    State.Stack.Last().IsClosed = charIsClosed;     // save for attribute check in Text method
                    sink.StartChar(State, actualMarker, charIsClosed,
                                   token.Type == UsfmTokenType.Unknown || invalidMarker, tokens[index].Attributes);
                }
                break;

            case UsfmTokenType.Note:
                // Look for category (the sequence: \cat, text, \cat*)
                string noteCategory = null;
                if (index < tokens.Count - 3 &&
                    tokens[index + 1].Marker == "cat" &&
                    tokens[index + 2].Text != null &&
                    tokens[index + 3].Marker == "cat*")
                {
                    // Get category
                    noteCategory = tokens[index + 2].Text.Trim();
                    skip        += 3;
                }

                State.Stack.Add(new UsfmParserElement(UsfmElementTypes.Note, token.Marker));

                if (sink != null)
                {
                    sink.StartNote(State, token.Marker, token.Data[0], noteCategory, IsTokenClosed());
                }
                break;

            case UsfmTokenType.Text:
                string text = token.Text;

                // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types),
                // or at very end, strip final space
                // This is because USFM requires these to be on a new line, therefore adding whitespace
                if ((index == tokens.Count - 1 ||
                     tokens[index + 1].Type == UsfmTokenType.Paragraph ||
                     tokens[index + 1].Type == UsfmTokenType.Book ||
                     tokens[index + 1].Type == UsfmTokenType.Chapter) &&
                    text.Length > 0 && text[text.Length - 1] == ' ')
                {
                    text = text.Substring(0, text.Length - 1);
                }

                if (sink != null)
                {
                    // Replace ~ with nbsp
                    text = text.Replace('~', '\u00A0');

                    // Report each // as an optional break and every other piece as text
                    // NOTE(review): passes lowercase `state` rather than `State` - presumably
                    // the same object; confirm.
                    foreach (string str in optBreakSplitter.Split(text))
                    {
                        if (str == "//")
                        {
                            sink.OptBreak(state);
                        }
                        else
                        {
                            sink.Text(state, str);
                        }
                    }
                }
                break;

            case UsfmTokenType.Milestone:
            case UsfmTokenType.MilestoneEnd:
                // Currently, parse state doesn't need to be updated, so just inform the sink about the milestone.
                sink?.Milestone(state, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes);
                break;
            }

            return(true);
        }