示例#1
0
 /// <summary>
 /// Initializes parser state for the given stylesheet, positioned at the supplied reference.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet stored on the state</param>
 /// <param name="verseRef">starting reference; the result of its Clone() is what gets stored</param>
 public UsfmParserState(ScrStylesheet scrStylesheet, VerseRef verseRef)
 {
     this.ScrStylesheet = scrStylesheet;
     // Starts with an empty element stack.
     this.Stack = new List<UsfmParserElement>();
     this.VerseRef = verseRef.Clone();
     this.VerseOffset = 0;
 }
示例#2
0
 /// <summary>
 /// Creates a USFM parser whose state is freshly built from the stylesheet and the initial reference.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet kept by the parser and passed to the new parser state</param>
 /// <param name="tokens">list of tokens to parse</param>
 /// <param name="verseRef">initial reference for the parser</param>
 /// <param name="sink">optional sink to send parse events to. Null for none</param>
 public UsfmParser(ScrStylesheet scrStylesheet, List <UsfmToken> tokens, VerseRef verseRef, UsfmParserSink sink)
 {
     UsfmParserState initialState = new UsfmParserState(scrStylesheet, verseRef);

     this.scrStylesheet = scrStylesheet;
     this.tokens = tokens;
     state = initialState;
     this.sink = sink;
 }
示例#3
0
 /// <summary>
 /// Initializes parser state for the given stylesheet with a default-constructed reference.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet stored on the state</param>
 public UsfmParserState(ScrStylesheet scrStylesheet)
 {
     this.ScrStylesheet = scrStylesheet;
     // Starts with an empty element stack.
     this.Stack = new List<UsfmParserElement>();
     this.VerseRef = new VerseRef();
     this.VerseOffset = 0;
 }
示例#4
0
 /// <summary>
 /// Creates a sink bound to a stylesheet and to the marker check that owns it.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet stored on the sink</param>
 /// <param name="currentBookId">book id supplied by the caller; not read by this constructor body</param>
 /// <param name="markerCheck">owning marker check stored on the sink</param>
 public MarkerCheckSink(ScrStylesheet scrStylesheet, string currentBookId, MarkerCheck markerCheck)
 {
     this.scrStylesheet = scrStylesheet;
     this.markerCheck = markerCheck;

     // Fixed initial flag values: left-to-right text and a default stylesheet.
     this.isRightToLeft = false;
     this.stylesheetIsDefault = true;
 }
示例#5
0
        /// <summary>
        /// Runs the check on the text of one book: first the no-space-between-markers check on the
        /// raw text, then tokenization with the "usfm.sty" stylesheet and the token-based check.
        /// </summary>
        /// <param name="bookNum">number of the book being checked</param>
        /// <param name="text">USFM text of the book</param>
        /// <returns>the result of CheckInternal for the tokenized text</returns>
        public bool Run(int bookNum, string text)
        {
            CheckForNoSpaceBetweenMarkers(bookNum, text);

            ScrStylesheet stylesheet = new ScrStylesheet("usfm.sty");
            ScrParser bookParser = new ScrParser(stylesheet, text);
            List <UsfmToken> bookTokens = bookParser.GetUsfmTokens(bookNum);

            bool result = CheckInternal(bookTokens, bookNum, stylesheet);
            return result;
        }
示例#6
0
 /// <summary>
 /// Creates a USFM parser that works on a caller-supplied parser state.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet kept by the parser</param>
 /// <param name="tokens">list of tokens to parse</param>
 /// <param name="state">initial state of the parser; stored as given</param>
 /// <param name="sink">optional sink to send parse events to. Null for none</param>
 /// <param name="tokensPreserveWhitespace">True if the tokens were created while preserving whitespace,
 /// false otherwise (the default)</param>
 public UsfmParser(ScrStylesheet scrStylesheet, List <UsfmToken> tokens, UsfmParserState state, UsfmParserSink sink,
                   bool tokensPreserveWhitespace = false)
 {
     // What is being parsed and how it was tokenized.
     this.tokens = tokens;
     this.tokensPreserveWhitespace = tokensPreserveWhitespace;
     this.scrStylesheet = scrStylesheet;

     // Where the parser starts and where events go.
     this.state = state;
     this.sink = sink;
 }
示例#7
0
        /// <summary>
        /// Builds a new stylesheet containing the tags of all the specified stylesheets, added in the
        /// order the stylesheets are given. The new stylesheet is named by joining the source names with "-".
        /// Per the original contract, when a marker/tag is defined in more than one stylesheet the
        /// definition from the later stylesheet is the one kept, and style properties are not merged.
        /// Each tag is added as a Clone() of the source tag, so the source stylesheets' tags are not
        /// handed to the new stylesheet directly.
        /// </summary>
        /// <param name="stylesheets">stylesheets to combine, earliest first</param>
        /// <returns>the newly created stylesheet</returns>
        public static ScrStylesheet MergeStylesheets(params ScrStylesheet[] stylesheets)
        {
            string mergedName = string.Join("-", stylesheets.Select(ss => ss.name));
            ScrStylesheet merged = new ScrStylesheet(mergedName);

            foreach (ScrStylesheet source in stylesheets)
            {
                foreach (ScrTag sourceTag in source.Tags)
                {
                    merged.AddTagInternal(sourceTag.Clone());
                }
            }

            return merged;
        }
示例#8
0
        /// <summary>
        /// Runs the marker check over an already tokenized book.
        /// Records an error when the first token is not the "id" marker, then parses all tokens
        /// through a MarkerCheckSink and asks the sink to report its pending errors.
        /// </summary>
        /// <param name="tokens">tokens of the book</param>
        /// <param name="bookNum">number of the book being checked</param>
        /// <param name="scrStylesheet">stylesheet used by the sink and the parser</param>
        /// <returns>true if this check or the sink has flagged marker errors</returns>
        private bool CheckInternal(List <UsfmToken> tokens, int bookNum, ScrStylesheet scrStylesheet)
        {
            // Parsing starts at chapter 1, verse 0 of the book.
            VerseRef bookStart = new VerseRef(bookNum, 1, 0, ScrVers.English);

            bool firstTokenIsNotId = tokens.Count > 0 && tokens[0].Marker != "id";
            if (firstTokenIsNotId)
            {
                RecordError(bookStart, "", 0, "#" + missingIdMarker);
            }

            MarkerCheckSink checkSink = new MarkerCheckSink(scrStylesheet, bookStart.Book, this);
            UsfmParser usfmParser = new UsfmParser(scrStylesheet, tokens, bookStart, checkSink);
            usfmParser.ProcessTokens();

            // Report problems that are still pending once all tokens have been processed.
            checkSink.ReportPendingVerseNoParaError();
            checkSink.ReportOpenMilestoneErrors();

            return markerErrors || checkSink.MarkerErrors;
        }
示例#9
0
        /// <summary>
        /// Normalizes usfm. When whitespace is not preserved, the text is converted to tokens and
        /// back to a string; when it is preserved, the text is used unchanged. In both cases the
        /// result is passed through AddRtlAsNeeded.
        /// </summary>
        /// <param name="scrStylesheet">stylesheet used for tokenizing</param>
        /// <param name="usfm">original usfm</param>
        /// <param name="preserveWhitespace">true to skip the token round trip</param>
        /// <param name="rtl">true for right-to-left texts</param>
        /// <returns>normalized usfm</returns>
        public static string NormalizeUsfm(ScrStylesheet scrStylesheet, string usfm, bool preserveWhitespace, bool rtl)
        {
            if (preserveWhitespace)
            {
                return AddRtlAsNeeded(usfm, rtl);
            }

            // Rebuild the usfm string from its tokens.
            List <UsfmToken> usfmTokens = Tokenize(scrStylesheet, usfm, false);
            string rebuilt = NormalizeTokenUsfm(usfmTokens, rtl);
            return AddRtlAsNeeded(rebuilt, rtl);
        }
示例#10
0
 /// <summary>
 /// Creates a parser for the given book text.
 /// </summary>
 /// <param name="stylesheet">stylesheet stored on the parser</param>
 /// <param name="text">book text stored on the parser</param>
 internal ScrParser(ScrStylesheet stylesheet, string text)
 {
     this.bookText = text;
     this.stylesheet = stylesheet;
 }
示例#11
0
 /// <summary>
 /// Initializes the parser state. All initialization is delegated to the base class
 /// constructor; this constructor adds nothing of its own.
 /// </summary>
 /// <param name="scrStylesheet">stylesheet passed to the base constructor</param>
 /// <param name="verseRef">reference passed to the base constructor</param>
 public ScrParserState(ScrStylesheet scrStylesheet, VerseRef verseRef)
     : base(scrStylesheet, verseRef)
 {
 }
示例#12
0
 /// <summary>
 /// Constructor for making a duplicate parser used to look ahead for the closing
 /// tokens of notes and character styles. The duplicate shares the source parser's
 /// stylesheet and token list; no other field is assigned here.
 /// </summary>
 /// <param name="usfmParser">parser to duplicate</param>
 /// <param name="sink">optional sink for the duplicate. Null (the default) for none</param>
 UsfmParser(UsfmParser usfmParser, UsfmParserSink sink = null)
 {
     this.sink = sink;
     this.tokens = usfmParser.tokens;
     this.scrStylesheet = usfmParser.scrStylesheet;
 }
示例#13
0
 /// <summary>
 /// Fully normalize usfm by converting to tokens and back.
 /// Convenience overload that calls the four-argument NormalizeUsfm with
 /// whitespace preservation turned off.
 /// </summary>
 /// <param name="scrStylesheet">book stylesheet</param>
 /// <param name="usfm">original usfm</param>
 /// <param name="rtl">true for right-to-left texts</param>
 /// <returns>normalized usfm</returns>
 public static string NormalizeUsfm(ScrStylesheet scrStylesheet, string usfm, bool rtl)
 {
     const bool preserveWhitespace = false;

     string normalized = NormalizeUsfm(scrStylesheet, usfm, preserveWhitespace, rtl);
     return normalized;
 }
示例#14
0
        /// <summary>
        /// Tokenize the specified USFM text.
        /// The text is scanned left to right. Runs of text up to the next backslash become Text tokens
        /// (with any attribute section after '|' handed to the matching start marker where applicable),
        /// and each backslash marker becomes a token whose type is chosen from the style type of the
        /// tag looked up in the stylesheet. When whitespace is not preserved, a final pass makes sure a
        /// space precedes tokens that require a preceding line break.
        /// </summary>
        /// <param name="scrStylesheet">stylesheet to use</param>
        /// <param name="usfm">usfm string</param>
        /// <param name="preserveWhitespace">true to preserve all whitespaces verbatim in tokens</param>
        /// <returns>list of tokens</returns>
        public static List <UsfmToken> Tokenize(ScrStylesheet scrStylesheet, string usfm, bool preserveWhitespace)
        {
            List <UsfmToken> tokens = new List <UsfmToken>();
            UsfmToken        lastTokenWithAttributes = null;

            int index = 0;              // Current position

            while (index < usfm.Length)
            {
                // Position of the next backslash after the current character, or end of text if none
                int nextMarkerIndex = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
                if (nextMarkerIndex == -1)
                {
                    nextMarkerIndex = usfm.Length;
                }

                // If text, create text token until end or next \
                var ch = usfm[index];
                if (ch != '\\')
                {
                    string text = usfm.Substring(index, nextMarkerIndex - index);
                    if (!preserveWhitespace)
                    {
                        text = RegularizeSpaces(text);
                    }

                    lastTokenWithAttributes = null;
                    int attributeIndex = text.IndexOf('|');
                    if (attributeIndex >= 0)
                    {
                        UsfmToken matchingToken = FindMatchingStartMarker(usfm, tokens, nextMarkerIndex);
                        if (matchingToken != null)
                        {
                            ScrTag matchingTag = scrStylesheet.GetTag(matchingToken.NestlessMarker);
                            // leave attributes of other styles as regular text
                            if (matchingTag.StyleType == ScrStyleType.scCharacterStyle || matchingTag.StyleType == ScrStyleType.scMilestone ||
                                matchingTag.StyleType == ScrStyleType.scMilestoneEnd)
                            {
                                string adjustedText = text.Substring(0, attributeIndex);
                                if (matchingToken.SetAttributes(text.Substring(attributeIndex + 1),
                                                                matchingTag.DefaultAttribute, ref adjustedText, preserveWhitespace))
                                {
                                    text = adjustedText;
                                    // attributes for ending milestone are not copied from the beginning milestone, so don't update last token value
                                    if (matchingTag.StyleType == ScrStyleType.scCharacterStyle)
                                    {
                                        lastTokenWithAttributes = matchingToken;
                                    }
                                }
                            }
                        }
                    }

                    if (text.Length > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Text, null, text, null));
                    }

                    index = nextMarkerIndex;
                    continue;
                }

                // Get marker (and move past whitespace or star ending)
                index++;
                int markerStart = index;
                while (index < usfm.Length)
                {
                    ch = usfm[index];

                    // Backslash starts a new marker
                    if (ch == '\\')
                    {
                        break;
                    }

                    // don't require a space before the | that starts attributes - mainly for milestones to allow \qt-s|speaker\*
                    if (ch == '|')
                    {
                        break;
                    }

                    // End star is part of marker
                    if (ch == '*')
                    {
                        index++;
                        break;
                    }

                    if (IsNonSemanticWhiteSpace(ch))
                    {
                        // Preserve whitespace if needed, otherwise skip
                        if (!preserveWhitespace)
                        {
                            index++;
                        }
                        break;
                    }
                    index++;
                }
                string marker = usfm.Substring(markerStart, index - markerStart).TrimEnd();
                // Milestone stop/end markers are ended with \*, so marker will just be * and can be skipped
                if (marker == "*")
                {
                    // make sure that previous token was a milestone - have to skip space only tokens that may have been added when
                    // preserveSpace is true.
                    // LastOrDefault (rather than Last) so that a list that is empty, or that holds nothing but
                    // whitespace-only text tokens, yields null instead of throwing InvalidOperationException.
                    UsfmToken prevToken = tokens.LastOrDefault(t => t.Type != UsfmTokenType.Text || t.Text.Trim() != "");
                    if (prevToken != null && (prevToken.Type == UsfmTokenType.Milestone ||
                                              prevToken.Type == UsfmTokenType.MilestoneEnd))
                    {
                        // if the last item is an empty text token, remove it so we don't get extra space.
                        if (tokens.Last().Type == UsfmTokenType.Text)
                        {
                            tokens.RemoveAt(tokens.Count - 1);
                        }
                        continue;
                    }
                }

                // Multiple whitespace after non-end marker is ok
                if (!marker.EndsWith("*", StringComparison.Ordinal) && !preserveWhitespace)
                {
                    while ((index < usfm.Length) && IsNonSemanticWhiteSpace(usfm[index]))
                    {
                        index++;
                    }
                }

                // Lookup tag
                ScrTag tag = scrStylesheet.GetTag(marker.TrimStart('+'));

                // If starts with a plus and is not a character style or an end style, it is an unknown tag
                if (marker.StartsWith("+", StringComparison.Ordinal) && tag.StyleType != ScrStyleType.scCharacterStyle && tag.StyleType != ScrStyleType.scEndStyle)
                {
                    tag = scrStylesheet.GetTag(marker);
                }

                // Note: Unless this is a milestone, the tag's own end marker is not used. The end marker is
                // always marker + "*", which keeps any plus prefix that is on the marker.
                string endMarker = tag.StyleType != ScrStyleType.scMilestone ? marker + "*" : tag.Endmarker;

                switch (tag.StyleType)
                {
                case ScrStyleType.scCharacterStyle:
                    // Handle verse special case
                    UsfmToken newToken;
                    if ((tag.TextProperties & TextProperties.scVerse) > 0)
                    {
                        newToken = new UsfmToken(UsfmTokenType.Verse, marker, null, null,
                                                 GetNextWord(usfm, ref index, preserveWhitespace));
                    }
                    else
                    {
                        newToken = new UsfmToken(UsfmTokenType.Character, marker, null, endMarker);
                    }
                    tokens.Add(newToken);
                    break;

                case ScrStyleType.scParagraphStyle:
                    // Handle chapter special case
                    if ((tag.TextProperties & TextProperties.scChapter) > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Chapter, marker, null, null, GetNextWord(usfm, ref index, preserveWhitespace)));
                    }
                    else if ((tag.TextProperties & TextProperties.scBook) > 0)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Book, marker, null, null, GetNextWord(usfm, ref index, preserveWhitespace)));
                    }
                    else
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
                    }
                    break;

                case ScrStyleType.scNoteStyle:
                    tokens.Add(new UsfmToken(UsfmTokenType.Note, marker, null, endMarker, GetNextWord(usfm, ref index, preserveWhitespace)));
                    break;

                case ScrStyleType.scEndStyle:
                    lastTokenWithAttributes = AddEndMarker(marker, tokens, lastTokenWithAttributes);
                    break;

                case ScrStyleType.scUnknownStyle:
                    // End tokens are always end tokens, even if unknown
                    if (marker.EndsWith("*", StringComparison.Ordinal))
                    {
                        lastTokenWithAttributes = AddEndMarker(marker, tokens, lastTokenWithAttributes);
                    }
                    else
                    {
                        // Handle special case of esb and esbe which might not be in basic stylesheet
                        // but are always sidebars and so should be tokenized as paragraphs
                        if (marker == "esb" || marker == "esbe")
                        {
                            tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
                            break;
                        }
                        // Create unknown token with a corresponding end note
                        tokens.Add(new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*"));
                    }
                    break;

                case ScrStyleType.scMilestone:
                case ScrStyleType.scMilestoneEnd:
                    // If a milestone is not followed by an ending \*, don't create a milestone token for the beginning. Instead create a
                    // text token for all the text up to the beginning of the next marker. This makes typing of milestones easier, since
                    // a partially typed milestone is kept as plain text while its ending has not been typed yet.
                    if (!MilestoneEnded(usfm, index))
                    {
                        int endOfText = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
                        if (endOfText == -1)
                        {
                            endOfText = usfm.Length;
                        }
                        string milestoneText = usfm.Substring(index, endOfText - index);
                        // add back space that was removed after marker
                        if (milestoneText.Length > 0 && milestoneText[0] != ' ' && milestoneText[0] != '|')
                        {
                            milestoneText = " " + milestoneText;
                        }
                        tokens.Add(new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null));
                        index = endOfText;
                    }
                    else if (tag.StyleType == ScrStyleType.scMilestone)
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker));
                    }
                    else
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null));
                    }
                    break;

                default:
                    Debug.Fail("Unknown ScrStyleType");
                    break;
                }
            }

            // Forces a space to be present in tokenization if immediately
            // before a token requiring a preceding CR/LF. This is to ensure
            // that when written to disk and re-read, that tokenization
            // will match. For example, "\p test\p here" requires a space
            // after "test". Also, "\p \em test\em*\p here" requires a space
            // token inserted after \em*
            if (!preserveWhitespace)
            {
                for (int i = 1; i < tokens.Count; i++)
                {
                    // If requires newline (verses do, except when after '(' or '[')
                    if (tokens[i].Type == UsfmTokenType.Book ||
                        tokens[i].Type == UsfmTokenType.Chapter ||
                        tokens[i].Type == UsfmTokenType.Paragraph ||
                        (tokens[i].Type == UsfmTokenType.Verse &&
                         !(tokens[i - 1].Type == UsfmTokenType.Text &&
                           (tokens[i - 1].Text.EndsWith("(", StringComparison.Ordinal) || tokens[i - 1].Text.EndsWith("[", StringComparison.Ordinal)))))
                    {
                        // Add space to text token
                        if (tokens[i - 1].Type == UsfmTokenType.Text)
                        {
                            if (!tokens[i - 1].Text.EndsWith(" ", StringComparison.Ordinal))
                            {
                                tokens[i - 1].Text = tokens[i - 1].Text + " ";
                            }
                        }
                        else if (tokens[i - 1].Type == UsfmTokenType.End)
                        {
                            // Insert space token after * of end marker
                            tokens.Insert(i, new UsfmToken(UsfmTokenType.Text, null, " ", null));
                            // Step past the inserted token so the current token is not examined again
                            i++;
                        }
                    }
                }
            }

            return(tokens);
        }