/// <summary>
/// Initializes the parser state with a stylesheet and a starting verse reference.
/// The element stack starts out empty and the verse offset starts at zero.
/// </summary>
/// <param name="scrStylesheet">stylesheet stored on the state</param>
/// <param name="verseRef">starting reference; the result of <c>verseRef.Clone()</c> is what gets stored</param>
public UsfmParserState(ScrStylesheet scrStylesheet, VerseRef verseRef)
{
    this.ScrStylesheet = scrStylesheet;
    this.Stack = new List<UsfmParserElement>();
    this.VerseRef = verseRef.Clone();
    this.VerseOffset = 0;
}
/// <summary>
/// Creates a USFM parser whose state is a new <c>UsfmParserState</c> built from the
/// stylesheet and the initial verse reference.
/// </summary>
/// <param name="scrStylesheet">stylesheet kept by the parser and handed to the new state</param>
/// <param name="tokens">list of tokens to parse</param>
/// <param name="verseRef">initial reference for the parser</param>
/// <param name="sink">optional sink to send parse events to. Null for none</param>
public UsfmParser(ScrStylesheet scrStylesheet, List<UsfmToken> tokens, VerseRef verseRef, UsfmParserSink sink)
{
    UsfmParserState initialState = new UsfmParserState(scrStylesheet, verseRef);

    this.scrStylesheet = scrStylesheet;
    this.tokens = tokens;
    this.state = initialState;
    this.sink = sink;
}
/// <summary>
/// Initializes the parser state with a stylesheet only. The verse reference is a
/// default-constructed <c>VerseRef</c>, the element stack is empty and the verse offset is zero.
/// </summary>
/// <param name="scrStylesheet">stylesheet stored on the state</param>
public UsfmParserState(ScrStylesheet scrStylesheet)
{
    this.ScrStylesheet = scrStylesheet;
    this.Stack = new List<UsfmParserElement>();
    this.VerseRef = new VerseRef();
    this.VerseOffset = 0;
}
/// <summary>
/// Creates a sink that reports marker problems back to the owning <c>MarkerCheck</c>.
/// The sink starts out as left-to-right and flags the stylesheet as the default one.
/// </summary>
/// <param name="scrStylesheet">stylesheet kept by the sink</param>
/// <param name="currentBookId">book id; accepted for callers but not read by this constructor</param>
/// <param name="markerCheck">the check that owns this sink</param>
public MarkerCheckSink(ScrStylesheet scrStylesheet, string currentBookId, MarkerCheck markerCheck)
{
    this.markerCheck = markerCheck;
    this.scrStylesheet = scrStylesheet;

    // Fixed initial flags.
    this.stylesheetIsDefault = true;
    this.isRightToLeft = false;
}
/// <summary>
/// Runs the marker check over the text of one book: first the
/// no-space-between-markers check on the raw text, then the token-based check
/// using tokens produced with the "usfm.sty" stylesheet.
/// </summary>
/// <param name="bookNum">number of the book being checked</param>
/// <param name="text">USFM text of the book</param>
/// <returns>the value returned by <c>CheckInternal</c></returns>
public bool Run(int bookNum, string text)
{
    CheckForNoSpaceBetweenMarkers(bookNum, text);

    ScrStylesheet stylesheet = new ScrStylesheet("usfm.sty");
    ScrParser scrParser = new ScrParser(stylesheet, text);
    List<UsfmToken> parsedTokens = scrParser.GetUsfmTokens(bookNum);

    bool result = CheckInternal(parsedTokens, bookNum, stylesheet);
    return result;
}
/// <summary>
/// Creates a USFM parser that continues from an existing parser state.
/// </summary>
/// <param name="scrStylesheet">stylesheet kept by the parser</param>
/// <param name="tokens">list of tokens to parse</param>
/// <param name="state">initial state of the parser; stored as-is, not copied</param>
/// <param name="sink">optional sink to send parse events to. Null for none</param>
/// <param name="tokensPreserveWhitespace">True if the tokens were created while preserving whitespace,
/// false otherwise</param>
public UsfmParser(
    ScrStylesheet scrStylesheet,
    List<UsfmToken> tokens,
    UsfmParserState state,
    UsfmParserSink sink,
    bool tokensPreserveWhitespace = false)
{
    this.tokensPreserveWhitespace = tokensPreserveWhitespace;
    this.sink = sink;
    this.state = state;
    this.tokens = tokens;
    this.scrStylesheet = scrStylesheet;
}
/// <summary>
/// Builds a new stylesheet containing the tags of all the given stylesheets, visited in
/// argument order. The new stylesheet is named by joining the source names with "-".
/// When a marker/tag is defined in more than one stylesheet, the definition from the later
/// stylesheet is the one intended to win; style properties are not merged.
/// Every tag is cloned before it is added so that later changes to the merged stylesheet
/// do not touch the original stylesheets.
/// </summary>
/// <param name="stylesheets">stylesheets to combine, earliest first</param>
/// <returns>the newly created, merged stylesheet</returns>
public static ScrStylesheet MergeStylesheets(params ScrStylesheet[] stylesheets)
{
    string mergedName = string.Join("-", stylesheets.Select(sheet => sheet.name));
    ScrStylesheet merged = new ScrStylesheet(mergedName);

    foreach (ScrStylesheet sheet in stylesheets)
    {
        foreach (ScrTag sourceTag in sheet.Tags)
        {
            ScrTag copy = sourceTag.Clone();
            merged.AddTagInternal(copy);
        }
    }

    return merged;
}
/// <summary>
/// Runs the token-based marker check for one book. Records an error when the first
/// token is not the "id" marker, then feeds all tokens through a parser attached to a
/// <c>MarkerCheckSink</c> and lets the sink report its pending verse and milestone errors.
/// </summary>
/// <param name="tokens">tokens of the book</param>
/// <param name="bookNum">number of the book being checked</param>
/// <param name="scrStylesheet">stylesheet used by the parser and the sink</param>
/// <returns>true if either this check or the sink has flagged marker errors</returns>
private bool CheckInternal(List<UsfmToken> tokens, int bookNum, ScrStylesheet scrStylesheet)
{
    VerseRef firstVerse = new VerseRef(bookNum, 1, 0, ScrVers.English);

    // An empty token list is not reported as a missing id marker.
    bool firstTokenIsNotId = tokens.Count > 0 && tokens[0].Marker != "id";
    if (firstTokenIsNotId)
    {
        RecordError(firstVerse, "", 0, "#" + missingIdMarker);
    }

    MarkerCheckSink checkSink = new MarkerCheckSink(scrStylesheet, firstVerse.Book, this);
    UsfmParser usfmParser = new UsfmParser(scrStylesheet, tokens, firstVerse, checkSink);
    usfmParser.ProcessTokens();

    checkSink.ReportPendingVerseNoParaError();
    checkSink.ReportOpenMilestoneErrors();

    if (markerErrors)
    {
        return true;
    }
    return checkSink.MarkerErrors;
}
/// <summary>
/// Normalizes USFM. Unless whitespace is to be preserved, the text is tokenized and
/// rebuilt from its tokens (which adds the appropriate CR/LF and removes double spaces);
/// otherwise the text is used unchanged. In both cases the result is passed through
/// <c>AddRtlAsNeeded</c>.
/// </summary>
/// <param name="scrStylesheet">stylesheet used for tokenizing</param>
/// <param name="usfm">original usfm</param>
/// <param name="preserveWhitespace">true to skip the tokenize/rebuild step</param>
/// <param name="rtl">true for right-to-left texts</param>
/// <returns>normalized usfm</returns>
public static string NormalizeUsfm(ScrStylesheet scrStylesheet, string usfm, bool preserveWhitespace, bool rtl)
{
    if (preserveWhitespace)
    {
        return AddRtlAsNeeded(usfm, rtl);
    }

    // Build usfm string from tokens
    List<UsfmToken> tokenList = Tokenize(scrStylesheet, usfm, false);
    string rebuilt = NormalizeTokenUsfm(tokenList, rtl);
    return AddRtlAsNeeded(rebuilt, rtl);
}
/// <summary>
/// Creates a parser for the given book text.
/// </summary>
/// <param name="stylesheet">stylesheet kept by the parser</param>
/// <param name="text">text of the book, stored as <c>bookText</c></param>
internal ScrParser(ScrStylesheet stylesheet, string text)
{
    this.bookText = text;
    this.stylesheet = stylesheet;
}
/// <summary>
/// Initializes the parser state. All of the work is delegated to the base-class
/// constructor; nothing further is set up here.
/// </summary>
/// <param name="scrStylesheet">stylesheet forwarded to the base state</param>
/// <param name="verseRef">starting reference forwarded to the base state</param>
public ScrParserState(ScrStylesheet scrStylesheet, VerseRef verseRef)
    : base(scrStylesheet, verseRef)
{
}
/// <summary>
/// Constructor for making a duplicate parser used to look ahead for the closing
/// tokens of notes and character styles. The stylesheet and token list are shared with
/// the source parser. The parser state and the whitespace-preservation flag are not
/// assigned by this constructor.
/// </summary>
/// <param name="usfmParser">parser to duplicate</param>
/// <param name="sink">optional sink for the duplicate. Null for none</param>
UsfmParser(UsfmParser usfmParser, UsfmParserSink sink = null)
{
    this.sink = sink;
    this.tokens = usfmParser.tokens;
    this.scrStylesheet = usfmParser.scrStylesheet;
}
/// <summary>
/// Fully normalizes usfm by converting it to tokens and back, which adds all
/// appropriate CR/LF and removes double spaces. Convenience overload that never
/// preserves whitespace.
/// </summary>
/// <param name="scrStylesheet">book stylesheet</param>
/// <param name="usfm">original usfm</param>
/// <param name="rtl">true for right-to-left texts</param>
/// <returns>normalized usfm</returns>
public static string NormalizeUsfm(ScrStylesheet scrStylesheet, string usfm, bool rtl)
{
    const bool KeepWhitespace = false;
    string normalized = NormalizeUsfm(scrStylesheet, usfm, KeepWhitespace, rtl);
    return normalized;
}
/// <summary>
/// Tokenize the specified USFM text.
/// The text is scanned left to right: runs that do not start with a backslash become text
/// tokens (with any <c>|attributes</c> handed to the matching start marker), and each
/// backslash marker is looked up in the stylesheet and turned into a token according to its
/// style type. When whitespace is not preserved, a final pass makes sure there is a space
/// before every token that must start on a new line.
/// </summary>
/// <param name="scrStylesheet">stylesheet to use</param>
/// <param name="usfm">usfm string</param>
/// <param name="preserveWhitespace">true to preserve all whitespaces verbatim in tokens</param>
/// <returns>list of tokens</returns>
public static List<UsfmToken> Tokenize(ScrStylesheet scrStylesheet, string usfm, bool preserveWhitespace)
{
    List<UsfmToken> tokens = new List<UsfmToken>();
    UsfmToken lastTokenWithAttributes = null;

    int index = 0; // Current position
    while (index < usfm.Length)
    {
        // Search from index + 1 because the character at index may itself be the backslash
        // of the marker being processed.
        int nextMarkerIndex = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
        if (nextMarkerIndex == -1)
        {
            nextMarkerIndex = usfm.Length;
        }

        // If text, create text token until end or next \
        var ch = usfm[index];
        if (ch != '\\')
        {
            string text = usfm.Substring(index, nextMarkerIndex - index);
            if (!preserveWhitespace)
            {
                text = RegularizeSpaces(text);
            }

            lastTokenWithAttributes = null;
            int attributeIndex = text.IndexOf('|');
            if (attributeIndex >= 0)
            {
                UsfmToken matchingToken = FindMatchingStartMarker(usfm, tokens, nextMarkerIndex);
                if (matchingToken != null)
                {
                    ScrTag matchingTag = scrStylesheet.GetTag(matchingToken.NestlessMarker);

                    // leave attributes of other styles as regular text
                    if (matchingTag.StyleType == ScrStyleType.scCharacterStyle ||
                        matchingTag.StyleType == ScrStyleType.scMilestone ||
                        matchingTag.StyleType == ScrStyleType.scMilestoneEnd)
                    {
                        string adjustedText = text.Substring(0, attributeIndex);
                        if (matchingToken.SetAttributes(text.Substring(attributeIndex + 1),
                            matchingTag.DefaultAttribute, ref adjustedText, preserveWhitespace))
                        {
                            text = adjustedText;

                            // attributes for ending milestone are not copied from the beginning milestone,
                            // so don't update last token value
                            if (matchingTag.StyleType == ScrStyleType.scCharacterStyle)
                            {
                                lastTokenWithAttributes = matchingToken;
                            }
                        }
                    }
                }
            }

            if (text.Length > 0)
            {
                tokens.Add(new UsfmToken(UsfmTokenType.Text, null, text, null));
            }

            index = nextMarkerIndex;
            continue;
        }

        // Get marker (and move past whitespace or star ending)
        index++;
        int markerStart = index;
        while (index < usfm.Length)
        {
            ch = usfm[index];

            // Backslash starts a new marker
            if (ch == '\\')
            {
                break;
            }

            // don't require a space before the | that starts attributes - mainly for milestones
            // to allow \qt-s|speaker\*
            if (ch == '|')
            {
                break;
            }

            // End star is part of marker
            if (ch == '*')
            {
                index++;
                break;
            }

            if (IsNonSemanticWhiteSpace(ch))
            {
                // Preserve whitespace if needed, otherwise skip
                if (!preserveWhitespace)
                {
                    index++;
                }
                break;
            }

            index++;
        }

        string marker = usfm.Substring(markerStart, index - markerStart).TrimEnd();

        // Milestone stop/end markers are ended with \*, so marker will just be * and can be skipped
        if (marker == "*")
        {
            // make sure that previous token was a milestone - have to skip space only tokens that may
            // have been added when preserveSpace is true.
            // LastOrDefault (rather than Last) is required here: when every token so far is a
            // whitespace-only text token, no element matches and Last would throw
            // InvalidOperationException. LastOrDefault also yields null for an empty list.
            UsfmToken prevToken = tokens.LastOrDefault(t => t.Type != UsfmTokenType.Text || t.Text.Trim() != "");
            if (prevToken != null &&
                (prevToken.Type == UsfmTokenType.Milestone || prevToken.Type == UsfmTokenType.MilestoneEnd))
            {
                // if the last item is an empty text token, remove it so we don't get extra space.
                if (tokens.Last().Type == UsfmTokenType.Text)
                {
                    tokens.RemoveAt(tokens.Count - 1);
                }
                continue;
            }
        }

        // Multiple whitespace after non-end marker is ok
        if (!marker.EndsWith("*", StringComparison.Ordinal) && !preserveWhitespace)
        {
            while ((index < usfm.Length) && IsNonSemanticWhiteSpace(usfm[index]))
            {
                index++;
            }
        }

        // Lookup tag
        ScrTag tag = scrStylesheet.GetTag(marker.TrimStart('+'));

        // If starts with a plus and is not a character style or an end style, it is an unknown tag
        if (marker.StartsWith("+", StringComparison.Ordinal) &&
            tag.StyleType != ScrStyleType.scCharacterStyle &&
            tag.StyleType != ScrStyleType.scEndStyle)
        {
            tag = scrStylesheet.GetTag(marker);
        }

        // Note: Unless this is a milestone, tag.Marker and tag.Endmarker are ignored, as the plus
        // prefix must be kept and the end marker is always marker + "*"
        string endMarker = tag.StyleType != ScrStyleType.scMilestone ? marker + "*" : tag.Endmarker;

        switch (tag.StyleType)
        {
            case ScrStyleType.scCharacterStyle:
                // Handle verse special case
                UsfmToken newToken;
                if ((tag.TextProperties & TextProperties.scVerse) > 0)
                {
                    newToken = new UsfmToken(UsfmTokenType.Verse, marker, null, null,
                        GetNextWord(usfm, ref index, preserveWhitespace));
                }
                else
                {
                    newToken = new UsfmToken(UsfmTokenType.Character, marker, null, endMarker);
                }
                tokens.Add(newToken);
                break;

            case ScrStyleType.scParagraphStyle:
                // Handle chapter special case
                if ((tag.TextProperties & TextProperties.scChapter) > 0)
                {
                    tokens.Add(new UsfmToken(UsfmTokenType.Chapter, marker, null, null,
                        GetNextWord(usfm, ref index, preserveWhitespace)));
                }
                else if ((tag.TextProperties & TextProperties.scBook) > 0)
                {
                    tokens.Add(new UsfmToken(UsfmTokenType.Book, marker, null, null,
                        GetNextWord(usfm, ref index, preserveWhitespace)));
                }
                else
                {
                    tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
                }
                break;

            case ScrStyleType.scNoteStyle:
                tokens.Add(new UsfmToken(UsfmTokenType.Note, marker, null, endMarker,
                    GetNextWord(usfm, ref index, preserveWhitespace)));
                break;

            case ScrStyleType.scEndStyle:
                lastTokenWithAttributes = AddEndMarker(marker, tokens, lastTokenWithAttributes);
                break;

            case ScrStyleType.scUnknownStyle:
                // End tokens are always end tokens, even if unknown
                if (marker.EndsWith("*", StringComparison.Ordinal))
                {
                    lastTokenWithAttributes = AddEndMarker(marker, tokens, lastTokenWithAttributes);
                }
                else
                {
                    // Handle special case of esb and esbe which might not be in basic stylesheet
                    // but are always sidebars and so should be tokenized as paragraphs
                    if (marker == "esb" || marker == "esbe")
                    {
                        tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
                        break;
                    }

                    // Create unknown token with a corresponding end note
                    tokens.Add(new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*"));
                }
                break;

            case ScrStyleType.scMilestone:
            case ScrStyleType.scMilestoneEnd:
                // If a milestone is not followed by an ending \*, don't create a milestone token for
                // the beginning. Instead create a text token for all the text up to the beginning of
                // the next marker. This makes typing of milestones easiest, since the partially typed
                // milestone won't be reformatted to have a normal ending before it has been typed.
                if (!MilestoneEnded(usfm, index))
                {
                    int endOfText = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
                    if (endOfText == -1)
                    {
                        endOfText = usfm.Length;
                    }

                    string milestoneText = usfm.Substring(index, endOfText - index);

                    // add back space that was removed after marker
                    if (milestoneText.Length > 0 && milestoneText[0] != ' ' && milestoneText[0] != '|')
                    {
                        milestoneText = " " + milestoneText;
                    }

                    tokens.Add(new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null));
                    index = endOfText;
                }
                else if (tag.StyleType == ScrStyleType.scMilestone)
                {
                    tokens.Add(new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker));
                }
                else
                {
                    tokens.Add(new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null));
                }
                break;

            default:
                Debug.Fail("Unknown ScrStyleType");
                break;
        }
    }

    // Forces a space to be present in tokenization if immediately
    // before a token requiring a preceding CR/LF. This is to ensure
    // that when written to disk and re-read, that tokenization
    // will match. For example, "\p test\p here" requires a space
    // after "test". Also, "\p \em test\em*\p here" requires a space
    // token inserted after \em*
    if (!preserveWhitespace)
    {
        for (int i = 1; i < tokens.Count; i++)
        {
            // If requires newline (verses do, except when after '(' or '[')
            if (tokens[i].Type == UsfmTokenType.Book ||
                tokens[i].Type == UsfmTokenType.Chapter ||
                tokens[i].Type == UsfmTokenType.Paragraph ||
                (tokens[i].Type == UsfmTokenType.Verse &&
                 !(tokens[i - 1].Type == UsfmTokenType.Text &&
                   (tokens[i - 1].Text.EndsWith("(", StringComparison.Ordinal) ||
                    tokens[i - 1].Text.EndsWith("[", StringComparison.Ordinal)))))
            {
                // Add space to text token
                if (tokens[i - 1].Type == UsfmTokenType.Text)
                {
                    if (!tokens[i - 1].Text.EndsWith(" ", StringComparison.Ordinal))
                    {
                        tokens[i - 1].Text = tokens[i - 1].Text + " ";
                    }
                }
                else if (tokens[i - 1].Type == UsfmTokenType.End)
                {
                    // Insert space token after * of end marker
                    tokens.Insert(i, new UsfmToken(UsfmTokenType.Text, null, " ", null));

                    // Step over the inserted token so the current token is not examined again.
                    i++;
                }
            }
        }
    }

    return tokens;
}