/// <summary>
/// Breaks <paramref name="input"/> with a fresh <c>CustomBreaker</c> and asserts that every segment
/// between two consecutive reported break positions equals the matching entry of <paramref name="output"/>.
/// </summary>
/// <param name="input">Text to break.</param>
/// <param name="output">Expected segments, in order.</param>
/// <param name="options">Breaker settings; a default <c>TestOptions</c> is used when null.</param>
public void BasicTest(string input, string[] output, TestOptions options = null)
{
    options = options ?? new TestOptions();

    // Position 0 is seeded so the first segment starts at the beginning of the input.
    var breakPositions = new List<int> { 0 };
    var customBreaker = new CustomBreaker();
    customBreaker.SetNewBreakHandler(visitor => breakPositions.Add(visitor.LatestBreakAt));

    // Apply the options under test.
    customBreaker.BreakNumberAfterText = options.BreakNumberAfterText;
    customBreaker.EngBreakingEngine.SurrogatePairBreakingOption = options.SurrogatePairBreakingOption;

    customBreaker.BreakWords(input);

    // Compare each [previous break, next break) slice with the expected segment.
    for (int next = 1; next < breakPositions.Count; next++)
    {
        int segmentStart = breakPositions[next - 1];
        int segmentLength = breakPositions[next] - segmentStart;
        Assert.AreEqual(output[next - 1], input.Substring(segmentStart, segmentLength));
    }
}
/// <summary>
/// Breaks the text of <c>textBox1</c> with a breaker set up from an ICU text-file dictionary provider
/// and lists every span in <c>listBox1</c> as "start-index text".
/// </summary>
private void cmdManaged_Click(object sender, EventArgs e)
{
    // Some languages (e.g. Thai, Lao) need dictionary breaking; the dictionary data comes from the icu-project.
    // TODO: the dictionary should be read once
    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu58/brkitr_src/dictionaries";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    char[] buffer = textBox1.Text.ToCharArray();
    listBox1.Items.Clear();

    wordBreaker.BreakWords(buffer, 0, buffer.Length);
    foreach (BreakSpan span in wordBreaker.GetBreakSpanIter())
    {
        string word = new string(buffer, span.startAt, span.len);
        listBox1.Items.Add($"{span.startAt} {word}");
    }
}
/// <summary>
/// Breaks a French fable with the English breaking engine and compares the text, with a separator
/// inserted at every reported break position, against a hand-marked expectation.
/// </summary>
public void EngEngine()
{
    //Text source: https://en.wikibooks.org/wiki/French/Texts/Simple/Le_Corbeau_et_le_Renard
    const string Le_Corbeau_et_le_Renard = @" Maître Corbeau, sur un arbre perché, Tenait en son bec un fromage. Maître Renard, par l’odeur alléché, Lui tint à peu près ce langage : « Hé ! bonjour, Monsieur du Corbeau. Que vous êtes joli ! Que vous me semblez beau ! Sans mentir, si votre ramage Se rapporte à votre plumage, Vous êtes le Phénix des hôtes de ces bois. » A ces mots le Corbeau ne se sent pas de joie ; Et pour montrer sa belle voix, Il ouvre un large bec, laisse tomber sa proie. Le Renard s’en saisit, et dit : « Mon bon Monsieur, Apprenez que tout flatteur Vit aux dépens de celui qui l’écoute : Cette leçon vaut bien un fromage, sans doute. » Le Corbeau, honteux et confus, Jura, mais un peu tard, qu’on ne l’y prendrait plus.";
    const string Le_Corbeau_et_le_Renard__Broken = @" |Maître| |Corbeau|,| |sur| |un| |arbre| |perché|,| |Tenait| |en| |son| |bec| |un| |fromage.| |Maître| |Renard|,| |par| |l|’|odeur| |alléché|,| |Lui| |tint| |à| |peu| |près| |ce| |langage| |:| |«| |Hé| |!| |bonjour|,| |Monsieur| |du| |Corbeau.| |Que| |vous| |êtes| |joli| |!| |Que| |vous| |me| |semblez| |beau| |!| |Sans| |mentir|,| |si| |votre| |ramage| |Se| |rapporte| |à| |votre| |plumage|,| |Vous| |êtes| |le| |Phénix| |des| |hôtes| |de| |ces| |bois.| |»| |A| |ces| |mots| |le| |Corbeau| |ne| |se| |sent| |pas| |de| |joie| |;| |Et| |pour| |montrer| |sa| |belle| |voix|,| |Il| |ouvre| |un| |large| |bec|,| |laisse| |tomber| |sa| |proie.| |Le| |Renard| |s|’|en| |saisit|,| |et| |dit| |:| |«| |Mon| |bon| |Monsieur|,| |Apprenez| |que| |tout| |flatteur| |Vit| |aux| |dépens| |de| |celui| |qui| |l|’|écoute| |:| |Cette| |leçon| |vaut| |bien| |un| |fromage|,| |sans| |doute.| |»| |Le| |Corbeau|,| |honteux| |et| |confus|,| |Jura|,| |mais| |un| |peu| |tard|,| |qu|’|on| |ne| |l|’|y| |prendrait| |plus.|";

    // Returns a copy of the text with the separator inserted at every break position.
    string BreakText(string text, string separator = "|")
    {
        var breaker = new CustomBreaker();
        breaker.ThrowIfCharOutOfRange = true;
        var breakList = new List<BreakAtInfo>();
#warning Use `breaker.BreakWords(text, breakList);` once #156 is merged
        breaker.BreakWords(text);
        breaker.CopyBreakResults(breakList);

        var marked = new StringBuilder(text);
        // Insert from the last break backwards so earlier insertions do not shift later positions.
        for (int index = breakList.Count - 1; index >= 0; index--)
        {
            marked.Insert(breakList[index].breakAt, separator);
        }
        return marked.ToString();
    }

    string actual = BreakText(Le_Corbeau_et_le_Renard);
    Assert.AreEqual(Le_Corbeau_et_le_Renard__Broken, actual);
}
/// <summary>
/// Runs the managed word breaker over the text of <c>textBox1</c> <paramref name="ntimes"/> times,
/// enumerating the resulting spans on every pass without using them.
/// </summary>
/// <param name="ntimes">Number of passes to run.</param>
void ParseWithManaged(int ntimes)
{
    CustomBreakerBuilder.DataDir = "../../../icu58/brkitr_src/dictionaries";
    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    char[] buffer = textBox1.Text.ToCharArray();

    int pass = ntimes - 1;
    while (pass >= 0)
    {
        wordBreaker.BreakWords(buffer, 0);
        // Walk the span iterator; the spans themselves are discarded.
        foreach (var span in wordBreaker.GetBreakSpanIter())
        {
        }
        --pass;
    }
}
/// <summary>
/// Breaks the text of <c>textBox1</c> with a breaker from <c>CustomBreakerBuilder</c> and lists every
/// span in <c>listBox1</c> as "start-index text".
/// </summary>
private void cmdManaged_Click(object sender, EventArgs e)
{
    // Point the builder at the dictionary data before creating the dictionary-based breaker.
    CustomBreakerBuilder.DataDir = "../../../icu58/brkitr_src/dictionaries";
    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

    char[] buffer = textBox1.Text.ToCharArray();
    listBox1.Items.Clear();

    wordBreaker.BreakWords(buffer, 0);
    foreach (var span in wordBreaker.GetBreakSpanIter())
    {
        string word = new string(buffer, span.startAt, span.len);
        listBox1.Items.Add($"{span.startAt} {word}");
    }
}
/// <summary>
/// Creates a new <c>CustomBreaker</c>, configures it from the form's controls, and breaks
/// <paramref name="inputBuffer"/>, adding one "start-index text" entry to <c>listBox1</c> per reported span.
/// </summary>
/// <param name="inputBuffer">Characters to break; the whole buffer is processed.</param>
void InitNewCustomTextBreakerAndBreakWords(char[] inputBuffer)
{
    // Demonstration only: a text breaker does not have to be created on every call, it can be reused.
    // Some languages (e.g. Thai, Lao) need dictionary breaking; the dictionary data comes from the icu-project.
    // TODO: the dictionary should be read once
    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu62/brkitr";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

    // Options for breaking into groups of consecutive Unicode ranges (this does not use the dictionary breaker).
    wordBreaker.EngBreakingEngine.SurrogatePairBreakingOption = (SurrogatePairBreakingOption)cmbSurrogatePairBreakOptions.SelectedItem;
    wordBreaker.UseUnicodeRangeBreaker = chkUseUnicodeRangeBreaker.Checked;
    wordBreaker.BreakNumberAfterText = true;

    listBox1.Items.Clear();

    // Results are pushed through the handler as they are found instead of being iterated afterwards.
    wordBreaker.SetNewBreakHandler(visitor =>
    {
        BreakSpan span = visitor.GetBreakSpan();
        string word = new string(inputBuffer, span.startAt, span.len);
        listBox1.Items.Add($"{span.startAt} {word}");
    });

    wordBreaker.BreakWords(inputBuffer, 0, inputBuffer.Length);
}
/// <summary>
/// Runs the managed word breaker over the text of <c>textBox1</c> <paramref name="ntimes"/> times,
/// with a break handler that ignores every result.
/// </summary>
/// <param name="ntimes">Number of passes to run.</param>
void ParseWithManaged(int ntimes)
{
    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu58/brkitr_src";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    // Only the breaking itself is exercised; results are dropped.
    wordBreaker.SetNewBreakHandler(visitor => { });

    char[] buffer = textBox1.Text.ToCharArray();

    int pass = ntimes - 1;
    while (pass >= 0)
    {
        wordBreaker.BreakWords(buffer, 0, buffer.Length);
        --pass;
    }
}
/// <summary>
/// Breaks <paramref name="input"/> with a default <c>CustomBreaker</c> and asserts that every segment
/// between two consecutive break positions equals the matching entry of <paramref name="output"/>.
/// </summary>
/// <param name="input">Text to break.</param>
/// <param name="output">Expected segments, in order.</param>
public void BasicTest(string input, string[] output)
{
    var customBreaker = new CustomBreaker();
    customBreaker.BreakWords(input, false);

    // Position 0 is seeded so the first segment starts at the beginning of the input.
    var breakPositions = new List<int> { 0 };
    customBreaker.LoadBreakAtList(breakPositions);

    for (int next = 1; next < breakPositions.Count; next++)
    {
        int segmentStart = breakPositions[next - 1];
        int segmentLength = breakPositions[next] - segmentStart;
        Assert.AreEqual(output[next - 1], input.Substring(segmentStart, segmentLength));
    }
}
/// <summary>
/// Verifies that "«Maître leçon»" is split into five spans and that each span carries the expected
/// start index, length and <c>WordKind</c>.
/// </summary>
public void WordKindTest()
{
    var breaker = new CustomBreaker { ThrowIfCharOutOfRange = true };
    var breakList = new List<BreakSpan>();
#warning Use `breaker.BreakWords("«Maître leçon»", breakList);` once #156 is merged
    breaker.BreakWords("«Maître leçon»");
    breakList.AddRange(breaker.GetBreakSpanIter());

    // Expected value first, actual second, so a failure message reports them the right way round
    // (consistent with the assertions inside BreakSpanEqual below).
    Assert.AreEqual(5, breakList.Count);

    // Field-by-field comparison of two spans.
    void BreakSpanEqual(BreakSpan actual, BreakSpan expected)
    {
        Assert.AreEqual(expected.startAt, actual.startAt);
        Assert.AreEqual(expected.len, actual.len);
        Assert.AreEqual(expected.wordKind, actual.wordKind);
    }

    BreakSpanEqual(breakList[0], new BreakSpan { startAt = 0, len = 1, wordKind = WordKind.Punc });
    BreakSpanEqual(breakList[1], new BreakSpan { startAt = 1, len = 6, wordKind = WordKind.Text });
    BreakSpanEqual(breakList[2], new BreakSpan { startAt = 7, len = 1, wordKind = WordKind.Whitespace });
    BreakSpanEqual(breakList[3], new BreakSpan { startAt = 8, len = 5, wordKind = WordKind.Text });
    BreakSpanEqual(breakList[4], new BreakSpan { startAt = 13, len = 1, wordKind = WordKind.Punc });
}
/// <summary>
/// Breaks the text of <c>textBox1</c> with a breaker from <c>CustomBreakerBuilder</c> and lists every
/// span in <c>listBox1</c> as "start-index text".
/// </summary>
private void cmdManaged_Click(object sender, EventArgs e)
{
    // Some languages (e.g. Thai, Lao) need dictionary breaking; the dictionary data comes from the icu-project.
    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

    char[] buffer = textBox1.Text.ToCharArray();
    listBox1.Items.Clear();

    wordBreaker.BreakWords(buffer, 0, buffer.Length);
    foreach (BreakSpan span in wordBreaker.GetBreakSpanIter())
    {
        string word = new string(buffer, span.startAt, span.len);
        listBox1.Items.Add($"{span.startAt} {word}");
    }
}
/// <summary>
/// Runs the managed word breaker over the text of <c>textBox1</c> <paramref name="ntimes"/> times,
/// enumerating the resulting spans on every pass without using them.
/// </summary>
/// <param name="ntimes">Number of passes to run.</param>
void ParseWithManaged(int ntimes)
{
    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu58/brkitr_src/dictionaries";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    char[] buffer = textBox1.Text.ToCharArray();

    int pass = ntimes - 1;
    while (pass >= 0)
    {
        wordBreaker.BreakWords(buffer, 0, buffer.Length);
        // Walk the span iterator; the spans themselves are discarded.
        foreach (var span in wordBreaker.GetBreakSpanIter())
        {
        }
        --pass;
    }
}
/// <summary>
/// Runs the managed word breaker over the text of <c>textBox1</c> <paramref name="ntimes"/> times,
/// using the breaking options selected on the form and a break handler that ignores every result.
/// </summary>
/// <param name="ntimes">Number of passes to run.</param>
void ParseWithManaged(int ntimes)
{
    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu58/brkitr_src";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    // Take the breaking options from the form's controls.
    wordBreaker.UseUnicodeRangeBreaker = chkUseUnicodeRangeBreaker.Checked;
    wordBreaker.EngBreakingEngine.SurrogatePairBreakingOption = (SurrogatePairBreakingOption)cmbSurrogatePairBreakOptions.SelectedItem;
    // Only the breaking itself is exercised; results are dropped.
    wordBreaker.SetNewBreakHandler(visitor => { });

    char[] buffer = textBox1.Text.ToCharArray();

    int pass = ntimes - 1;
    while (pass >= 0)
    {
        wordBreaker.BreakWords(buffer, 0, buffer.Length);
        --pass;
    }
}
/// <summary>
/// Breaks a fixed sample sentence that mixes right-to-left text with a parenthesised Latin word and
/// lists every reported span in <c>listBox1</c> as "start-index text".
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    listBox1.Items.Clear();

    // Hard-coded sample; the text box content is not used here.
    string sample = "یہ ایک (car) ہے۔";
    char[] buffer = sample.ToCharArray();

    var provider = new IcuSimpleTextFileDictionaryProvider();
    provider.DataDir = "../../../icu58/brkitr_src";
    CustomBreakerBuilder.Setup(provider);

    CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
    // Each span is added to the list box as soon as the breaker reports it.
    wordBreaker.SetNewBreakHandler(visitor =>
    {
        BreakSpan span = visitor.GetBreakSpan();
        string word = new string(buffer, span.startAt, span.len);
        listBox1.Items.Add($"{span.startAt} {word}");
    });

    wordBreaker.BreakWords(buffer);
}
/// <summary>
/// Breaks <paramref name="input"/> with a <c>CustomBreaker</c> and asserts that every segment between
/// two consecutive break positions equals the matching entry of <paramref name="output"/>.
/// </summary>
/// <param name="input">Text to break.</param>
/// <param name="output">Expected segments, in order.</param>
/// <param name="breakNumberAfterText">Value assigned to the breaker's <c>BreakNumberAfterText</c> option.</param>
public void BasicTest(string input, string[] output, bool breakNumberAfterText = false)
{
    var customBreaker = new CustomBreaker { BreakNumberAfterText = breakNumberAfterText };
    customBreaker.BreakWords(input);

    // Position 0 is seeded so the first segment starts at the beginning of the input.
    var breakPositions = new List<int> { 0 };
    customBreaker.CopyBreakResults(breakPositions);

    for (int next = 1; next < breakPositions.Count; next++)
    {
        int segmentStart = breakPositions[next - 1];
        int segmentLength = breakPositions[next] - segmentStart;
        Assert.AreEqual(output[next - 1], input.Substring(segmentStart, segmentLength));
    }
}
/// <summary>
/// Builds a <c>TextAtom</c> from LaTeX text-mode source. The source is first split into word-break
/// sections by <c>breaker.BreakWords</c> (the <c>breaker</c> instance is declared outside this method),
/// then the sections are walked by the local function <c>BuildBreakList</c>, which calls itself
/// recursively for brace groups and for command arguments.
/// </summary>
/// <remarks>
/// State shared by the local functions: <c>displayMath</c> is null outside math mode, false in inline
/// math and true in display math; <c>mathLaTeX</c> accumulates the math source while in math mode;
/// <c>dollarCount</c> counts consecutive unescaped <c>$</c> sections and is resolved by
/// <c>CheckDollarCount</c> (1 opens/closes inline math, 2 opens/closes display math, any other
/// non-zero count is an error); <c>backslashEscape</c> records that the previous section was a backslash.
/// <c>BuildBreakList</c> returns the index it stopped at: after a single section when
/// <c>oneCharOnly</c> is set, or when a section starting with a non-zero <c>stopChar</c> is reached.
/// Failures are reported by returning an error message string rather than by throwing, except for
/// <c>InvalidCodePathException</c>, which guards branches the code expects never to reach.
/// </remarks>
/// <param name="latexSource">LaTeX source; an empty span yields an empty <c>TextAtom.List</c>.</param>
/// <returns>The built atom on success, otherwise an error message.</returns>
public static Result <TextAtom> Build(ReadOnlySpan <char> latexSource) { if (latexSource.IsEmpty) { return(new TextAtom.List(Array.Empty <TextAtom>(), 0)); } bool? displayMath = null; StringBuilder mathLaTeX = null; bool backslashEscape = false; bool afterCommand = false; //ignore spaces after command bool afterNewline = false; int dollarCount = 0; var globalAtoms = new TextAtomListBuilder(); var breakList = new List <BreakAtInfo>(); breaker.BreakWords(latexSource, breakList); Result CheckDollarCount(TextAtomListBuilder atoms) { switch (dollarCount) { case 0: break; case 1: dollarCount = 0; switch (displayMath) { case true: return("Cannot close display math mode with $"); case false: if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; case null: mathLaTeX = new StringBuilder(); displayMath = false; break; } break; case 2: dollarCount = 0; switch (displayMath) { case true: if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; case false: return("Cannot close inline math mode with $$"); case null: mathLaTeX = new StringBuilder(); displayMath = true; break; } break; default: return("Invalid number of $: " + dollarCount); } return(Ok()); } Result <int> BuildBreakList(ReadOnlySpan <char> latex, TextAtomListBuilder atoms, int i, bool oneCharOnly, char stopChar) { void ParagraphBreak() { atoms.Break(3); #warning Should the newline and space occupy the same range? atoms.TextLength -= 3; atoms.Add(Space.ParagraphIndent, 3); } for (; i < breakList.Count; i++) { void ObtainRange(ReadOnlySpan <char> latexInput, int index, out int start, out int end, out ReadOnlySpan <char> section, out WordKind kind) { (start, end) = (index == 0 ? 
0 : breakList[index - 1].breakAt, breakList[index].breakAt); section = latexInput.Slice(start, end - start); kind = breakList[index].wordKind; } ObtainRange(latex, i, out var startAt, out var endAt, out var textSection, out var wordKind); bool SetPrevRange(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = i-- > 0; if (success) { ObtainRange(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } bool SetNextRange(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = ++i < breakList.Count; if (success) { ObtainRange(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } Result <TextAtom> ReadArgumentAtom(ReadOnlySpan <char> latexInput) { backslashEscape = false; var argAtoms = new TextAtomListBuilder(); if (BuildBreakList(latexInput, argAtoms, ++i, true, '\0').Bind(index => i = index).Error is string error) { return(error); } return(argAtoms.Build()); } SpanResult <char> ReadArgumentString(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { afterCommand = false; if (!SetNextRange(latexInput, ref section)) { return(Err("Missing argument")); } if (section.IsNot('{')) { return(Err("Missing {")); } int endingIndex = -1; //startAt + 1 to not start at the { we started at bool isEscape = false; for (int j = startAt + 1, bracketDepth = 0; j < latexInput.Length; j++) { if (latexInput[j] == '\\') { isEscape = true; } else if (latexInput[j] == '{' && !isEscape) { bracketDepth++; } else if (latexInput[j] == '}' && !isEscape) { if (bracketDepth > 0) { bracketDepth--; } else { endingIndex = j; break; } } else { isEscape = false; } } if (endingIndex == -1) { return(Err("Missing }")); } var resultText = latexInput.Slice(endAt, endingIndex - endAt); while (startAt < endingIndex) { _ = SetNextRange(latexInput, ref section); //this never fails because the above check } return(Ok(resultText)); } ReadOnlySpan <char> LookAheadForPunc(ReadOnlySpan 
<char> latexInput, ref ReadOnlySpan <char> section) { int start = endAt; while (SetNextRange(latexInput, ref section)) { if (wordKind != WordKind.Punc || SpecialChars.Contains(section[0])) { //We have overlooked by one SetPrevRange(latexInput, ref section); break; } } return(latexInput.Slice(start, endAt - start)); } //Nothing should be before dollar sign checking -- dollar sign checking uses continue; atoms.TextLength = startAt; if (textSection.Is('$')) { if (backslashEscape) { if (displayMath != null) { mathLaTeX.Append(@"\$"); } else { atoms.Add("$", LookAheadForPunc(latex, ref textSection)); } } else { dollarCount++; continue; } backslashEscape = false; } else { { if (CheckDollarCount(atoms).Error is string error) { return(error); } } if (!backslashEscape) { //Unescaped text section, inside display/inline math mode if (displayMath != null) { switch (textSection) { case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('\\'): backslashEscape = true; continue; default: mathLaTeX.Append(textSection); break; } } //Unescaped text section, not inside display/inline math mode else { switch (textSection) { case var _ when stopChar > 0 && textSection[0] == stopChar: return(Ok(i)); case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('\\'): backslashEscape = true; continue; case var _ when textSection.Is('#'): return("Unexpected command argument reference character # outside of new command definition (currently unsupported)"); case var _ when textSection.Is('^'): case var _ when textSection.Is('_'): return($"Unexpected script indicator {textSection[0]} outside of math mode"); case var _ when textSection.Is('&'): return($"Unexpected alignment tab character & outside of table environments"); case var _ when textSection.Is('~'): atoms.Add(); break; case var _ when textSection.Is('%'): 
var comment = new StringBuilder(); while (SetNextRange(latex, ref textSection) && wordKind != WordKind.NewLine) { comment.Append(textSection); } atoms.Comment(comment.ToString()); break; case var _ when textSection.Is('{'): if (BuildBreakList(latex, atoms, ++i, false, '}').Bind(index => i = index).Error is string error) { return(error); } break; case var _ when textSection.Is('}'): return("Unexpected }, unbalanced braces"); case var _ when wordKind == WordKind.NewLine: //Consume newlines after commands //Double newline == paragraph break if (afterNewline) { ParagraphBreak(); afterNewline = false; break; } else { atoms.Add(); afterNewline = true; continue; } case var _ when wordKind == WordKind.Whitespace: //Collpase spaces if (afterCommand) { continue; } else { atoms.Add(); } break; default: //Just ordinary text if (oneCharOnly) { if (startAt + 1 < endAt) //Only re-read if current break span is more than 1 long { i--; breakList[i] = new BreakAtInfo(breakList[i].breakAt + 1, breakList[i].wordKind); } //Need to allocate in the end :( //Don't look ahead for punc; we are looking for one char only atoms.Add(textSection[0].ToString(), default(ReadOnlySpan <char>)); } else { atoms.Add(textSection.ToString(), LookAheadForPunc(latex, ref textSection)); } break; } } afterCommand = false; } //Escaped text section but in inline/display math mode else if (displayMath != null) { switch (textSection) { case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('('): switch (displayMath) { case true: return("Cannot open inline math mode in display math mode"); case false: return("Cannot open inline math mode in inline math mode"); default: throw new InvalidCodePathException("displayMath is null. 
This switch should not be hit."); } case var _ when textSection.Is(')'): switch (displayMath) { case true: return("Cannot close inline math mode in display math mode"); case false: if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; default: throw new InvalidCodePathException("displayMath is null. This switch should not be hit."); } break; case var _ when textSection.Is('['): switch (displayMath) { case true: return("Cannot open display math mode in display math mode"); case false: return("Cannot open display math mode in inline math mode"); default: throw new InvalidCodePathException("displayMath is null. This switch should not be hit."); } case var _ when textSection.Is(']'): switch (displayMath) { case true: if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; case false: return("Cannot close display math mode in inline math mode"); default: throw new InvalidCodePathException("displayMath is null. 
This switch should not be hit."); } break; default: mathLaTeX.Append('\\').Append(textSection); break; } backslashEscape = false; } else { //Escaped text section and not in inline/display math mode afterCommand = true; switch (textSection) { case var _ when textSection.Is('('): mathLaTeX = new StringBuilder(); displayMath = false; break; case var _ when textSection.Is(')'): return("Cannot close inline math mode outside of math mode"); case var _ when textSection.Is('['): mathLaTeX = new StringBuilder(); displayMath = true; break; case var _ when textSection.Is(']'): return("Cannot close display math mode outside of math mode"); case var _ when textSection.Is('\\'): atoms.Break(1); break; case var _ when textSection.Is(','): atoms.Add(Space.ShortSpace, 1); break; case var _ when textSection.Is(':') || textSection.Is('>'): atoms.Add(Space.MediumSpace, 1); break; case var _ when textSection.Is(';'): atoms.Add(Space.LongSpace, 1); break; case var _ when textSection.Is('!'): atoms.Add(-Space.ShortSpace, 1); break; case var _ when wordKind == WordKind.Whitespace: //control space atoms.Add(); break; case var _ when textSection.Is("par"): ParagraphBreak(); break; case var _ when textSection.Is("fontsize"): { if (ReadArgumentString(latex, ref textSection).Bind(fontSize => { if (fontSize.Length > StringArgumentLimit) { return(Err($"Length of font size has over {StringArgumentLimit} characters. Please shorten it.")); } Span <byte> charBytes = stackalloc byte[fontSize.Length]; for (int j = 0; j < fontSize.Length; j++) { if (fontSize[j] > 127) { return(Err("Invalid font size")); } charBytes[j] = (byte)fontSize[j]; } return(System.Buffers.Text.Utf8Parser.TryParse(charBytes, out float parsedResult, out _, 'f') ? 
Ok(parsedResult) : Err("Invalid font size")); }).Bind( ReadArgumentAtom(latex), (fontSize, resizedContent) => atoms.Add(resizedContent, fontSize, "fontsize".Length) ).Error is string error ) { return(error); } break; } case var _ when textSection.Is("color"): { if (ReadArgumentString(latex, ref textSection).Bind(color => color.Length > StringArgumentLimit ? Err($"Length of color has over {StringArgumentLimit} characters. Please shorten it.") : Color.Create(color, !NoEnhancedColors) is Color value ? Ok(value) : Err("Invalid color") ).Bind( ReadArgumentAtom(latex), (color, coloredContent) => atoms.Add(coloredContent, color, "color".Length) ).Error is string error ) { return(error); } break; } //case "red", "yellow", ... case var shortColor when !NoEnhancedColors && shortColor.TryAccessDictionary(Color.PredefinedColors, out var color): { int tmp_commandLength = shortColor.Length; if (ReadArgumentAtom(latex).Bind( coloredContent => atoms.Add(coloredContent, color, tmp_commandLength) ).Error is string error ) { return(error); } break; } //case "textbf", "textit", ... bool ValidTextStyle(ReadOnlySpan <char> textStyle, out FontStyle fontStyle) { fontStyle = default; if (textStyle.Length > 3 && textStyle[0] == 'm' && textStyle[1] == 'a' && textStyle[2] == 't' && textStyle[3] == 'h') { return(false); } Span <char> copy = stackalloc char[textStyle.Length]; textStyle.CopyTo(copy); if (textStyle.Length > 3 && textStyle[0] == 't' && textStyle[1] == 'e' && textStyle[2] == 'x' && textStyle[3] == 't') { copy[0] = 'm'; copy[1] = 'a'; copy[2] = 't'; copy[3] = 'h'; } return(((ReadOnlySpan <char>)copy).TryAccessDictionary(FontStyleExtensions.FontStyles, out fontStyle)); } case var textStyle when ValidTextStyle(textStyle, out var fontStyle): { int tmp_commandLength = textStyle.Length; if (ReadArgumentAtom(latex) .Bind(builtContent => atoms.Add(builtContent, fontStyle, tmp_commandLength)) .Error is string error) { return(error); } break; } //case "^", "\"", ... 
case var textAccent when textAccent.TryAccessDictionary(TextAtoms.PredefinedAccents, out var accent): { int tmp_commandLength = textAccent.Length; if (ReadArgumentAtom(latex) .Bind(builtContent => atoms.Add(builtContent, accent, tmp_commandLength)) .Error is string error) { return(error); } break; } //case "textasciicircum", "textless", ... case var textSymbol when textSymbol.TryAccessDictionary(TextAtoms.PredefinedTextSymbols, out var replaceResult): atoms.Add(replaceResult, LookAheadForPunc(latex, ref textSection)); break; case var command: if (displayMath != null) { mathLaTeX.Append(command); //don't eat the command when parsing math } else { return($@"Unknown command \{command.ToString()}"); } break; } backslashEscape = false; } } afterNewline = false; if (oneCharOnly) { return(Ok(i)); } } if (backslashEscape) { return(@"Unknown command \"); } if (stopChar > 0) { return(stopChar == '}' ? "Expected }, unbalanced braces" : $@"Expected {stopChar}"); } return(Ok(i)); } { if (BuildBreakList(latexSource, globalAtoms, 0, false, '\0').Error is string error) { return(error); } } { if (CheckDollarCount(globalAtoms).Error is string error) { return(error); } } if (displayMath != null) { return("Math mode was not terminated"); } return(globalAtoms.Build()); }
public static Result <TextAtom> TextAtomFromLaTeX(string latexSource) { if (string.IsNullOrEmpty(latexSource)) { return(new TextAtom.List(Array.Empty <TextAtom>())); } int endAt = 0; bool?displayMath = null; var mathLaTeX = new StringBuilder(); bool backslashEscape = false; bool afterCommand = false; // ignore spaces after command bool afterNewline = false; int dollarCount = 0; var globalAtoms = new TextAtomListBuilder(); List <BreakAtInfo> breakList = new List <BreakAtInfo>(); // Roslyn bug that assumes breakList is nullable resulting in warnings so var is not used var breaker = new CustomBreaker(v => breakList.Add(new BreakAtInfo(v.LatestBreakAt, v.LatestWordKind))) { BreakNumberAfterText = true, ThrowIfCharOutOfRange = false }; breaker.BreakWords(latexSource); Result CheckDollarCount(int startAt, ref int endAt, TextAtomListBuilder atoms) { switch (dollarCount) { case 0: break; case 1: dollarCount = 0; switch (displayMath) { case true: return("Cannot close display math mode with $"); case false: if (atoms.Math(mathLaTeX.ToString(), false, startAt, ref endAt).Error is string error) { return(error); } mathLaTeX.Clear(); displayMath = null; break; case null: displayMath = false; break; } break; case 2: dollarCount = 0; switch (displayMath) { case true: if (atoms.Math(mathLaTeX.ToString(), true, startAt - 1, ref endAt).Error is string error) { return(error); } mathLaTeX.Clear(); displayMath = null; break; case false: return("Cannot close inline math mode with $$"); case null: displayMath = true; break; } break; default: return("Invalid number of $: " + dollarCount); } return(Ok()); } Result <int> BuildBreakList(ReadOnlySpan <char> latex, TextAtomListBuilder atoms, int i, bool oneCharOnly, char stopChar) { void ParagraphBreak() { atoms.Break(); atoms.Space(Space.ParagraphIndent); } for (; i < breakList.Count; i++) { void ObtainSection(ReadOnlySpan <char> latexInput, int index, out int start, out int end, out ReadOnlySpan <char> section, out WordKind kind) { (start, 
end) = (index == 0 ? 0 : breakList[index - 1].breakAt, breakList[index].breakAt); section = latexInput.Slice(start, end - start); kind = breakList[index].wordKind; } ObtainSection(latex, i, out var startAt, out endAt, out var textSection, out var wordKind); bool PreviousSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = i-- > 0; if (success) { ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } bool NextSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = ++i < breakList.Count; if (success) { ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } Result <TextAtom> ReadArgumentAtom(ReadOnlySpan <char> latexInput) { backslashEscape = false; var argAtoms = new TextAtomListBuilder(); return(BuildBreakList(latexInput, argAtoms, ++i, true, '\0') .Bind(index => { i = index; return argAtoms.Build(); })); } SpanResult <char> ReadArgumentString(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { afterCommand = false; if (!NextSection(latexInput, ref section)) { return(Err("Missing argument")); } if (section.IsNot('{')) { return(Err("Missing {")); } int endingIndex = -1; //startAt + 1 to not start at the { we started at bool isEscape = false; for (int j = startAt + 1, bracketDepth = 0; j < latexInput.Length; j++) { if (latexInput[j] == '\\') { isEscape = true; } else if (latexInput[j] == '{' && !isEscape) { bracketDepth++; } else if (latexInput[j] == '}' && !isEscape) { if (bracketDepth > 0) { bracketDepth--; } else { endingIndex = j; break; } } else { isEscape = false; } } if (endingIndex == -1) { return(Err("Missing }")); } var resultText = latexInput.Slice(endAt, endingIndex - endAt); while (startAt < endingIndex) { _ = NextSection(latexInput, ref section); //this never fails because the above check } return(Ok(resultText)); } Result <Color> ReadColor(ReadOnlySpan <char> latexInput, ref 
ReadOnlySpan <char> section) => ReadArgumentString(latexInput, ref section).Bind(color => Color.Create(color, !NoEnhancedColors) is Color value ? Ok(value) : Err("Invalid color: " + color.ToString()) ); ///<summary>Get punctutation after current section</summary> ReadOnlySpan <char> NextSectionWhilePunc(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { int start = endAt; ReadOnlySpan <char> specialChars = stackalloc[] { '#', '$', '%', '&', '\\', '^', '_', '{', '}', '~' }; while (NextSection(latexInput, ref section)) { if (wordKind != WordKind.Punc || specialChars.IndexOf(section[0]) != -1) { // We have overlooked by one when non-punctuation or special character is encountered PreviousSection(latexInput, ref section); break; } } return(latexInput.Slice(start, endAt - start)); } //Nothing should be before dollar sign checking -- dollar sign checking uses continue; atoms.TextLength = startAt; if (textSection.Is('$')) { if (backslashEscape) { if (displayMath != null) { mathLaTeX.Append(@"\$"); } else { atoms.Text("$", NextSectionWhilePunc(latex, ref textSection)); } } else { dollarCount++; continue; } backslashEscape = false; } else { { if (CheckDollarCount(startAt, ref endAt, atoms).Error is string error) { return(error); } } switch (backslashEscape, displayMath) {
/* //Paste this into the C# Interactive, fill <username> yourself #r "C:/Users/<username>/source/repos/CSharpMath/Typography/Build/NetStandard/Typography.TextBreak/bin/Debug/netstandard1.3/Typography.TextBreak.dll" using Typography.TextBreak; (int, WordKind, char)[] BreakText(string text) { var breaker = new CustomBreaker(); var breakList = new List<BreakAtInfo>(); breaker.BreakWords(text); breaker.LoadBreakAtList(breakList); //index is after the boundary -> last one will be out of range return breakList.Select(i => (i.breakAt, i.wordKind, text.ElementAtOrDefault(i.breakAt))).ToArray(); } BreakText(@"Here are some text $1 + 12 \frac23 \sqrt4$ $$Display$$ text") */ /* //Version 2 #r "C:/Users/<username>/source/repos/CSharpMath/Typography/Build/NetStandard/Typography.TextBreak/bin/Debug/netstandard1.3/Typography.TextBreak.dll" using Typography.TextBreak; string BreakText(string text, string seperator = "|") { var breaker = new CustomBreaker(); var breakList = new List<BreakAtInfo>(); breaker.BreakWords(text); breaker.LoadBreakAtList(breakList); //reverse to ensure earlier inserts do not affect later ones foreach (var @break in breakList.Select(i => i.breakAt).Reverse()) text = text.Insert(@break, seperator); return text; } BreakText(@"Here are some text $1 + 12 \frac23 \sqrt4$ $$Display$$ text") */ public static Result<TextAtom> Build(string text, bool enhancedColors) { if (string.IsNullOrEmpty(text)) return new TextAtom.List(Array.Empty<TextAtom>(), 0); bool? 
displayMath = null; StringBuilder mathLaTeX = null; bool backslashEscape = false; bool afterCommand = false; //ignore spaces after command int dollarCount = 0; var atoms = new TextAtomListBuilder(); var breaker = new CustomBreaker(); var breakList = new List<BreakAtInfo>(); breaker.BreakWords(text, false); breaker.LoadBreakAtList(breakList); Result CheckDollarCount() { switch (dollarCount) { case 0: break; case 1: dollarCount = 0; switch (displayMath) { case true: return "Cannot close display math mode with $"; case false: if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError) return "[Math mode error] " + mathError; mathLaTeX = null; displayMath = null; break; case null: mathLaTeX = new StringBuilder(); displayMath = false; break; } break; case 2: dollarCount = 0; switch (displayMath) { case true: if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError) return "[Math mode error] " + mathError; mathLaTeX = null; displayMath = null; break; case false: return "Cannot close inline math mode with $$"; case null: mathLaTeX = new StringBuilder(); displayMath = true; break; } break; default: return "Invalid number of $: " + dollarCount; } return Ok(); } (int startAt, int endAt, char endingChar, WordKind wordKind) ObtainRange(int i) => (i == 0 ? 
0 : breakList[i - 1].breakAt, breakList[i].breakAt, text[breakList[i].breakAt - 1], breakList[i].wordKind); for (var i = 0; i < breakList.Count; i++) { var (startAt, endAt, endingChar, wordKind) = ObtainRange(i); bool SetNextRange() { bool success = ++i < breakList.Count; if(success) (startAt, endAt, endingChar, wordKind) = ObtainRange(i); return success; } Result<string> ReadArgument() { afterCommand = false; if (!SetNextRange()) return Err("Missing argument"); if (endingChar != '{') { var toReturn = text[startAt].ToString(); #warning Not one char only, should skip spaces then read next char, and it is a possible command //range contains one char only if (startAt == endAt) _ = SetNextRange(); //reaching the end does not affect validity of argument else startAt += 1; return Ok(toReturn); } int endingIndex = -1; //startAt + 1 to not start at the { we started at for (int j = startAt + 1, bracketDepth = 0; j < text.Length; j++) { if (text[j] == '{') bracketDepth++; else if (text[j] == '}') if (bracketDepth > 0) bracketDepth--; else { endingIndex = j; break; } } if (endingIndex == -1) return Err("Missing }"); var resultText = text.Substring(endAt, endingIndex - endAt); while (startAt < endingIndex) _ = SetNextRange(); //this never fails because the above check return Ok(resultText); } atoms.TextLength = startAt; if (endingChar == '$') { if (backslashEscape) if (displayMath != null) mathLaTeX.Append(@"\$"); else atoms.Add("$"); else { dollarCount++; continue; } backslashEscape = false; } else { { if (CheckDollarCount().Error is string error) return error; } //Normal unescaped text section, could be in display/inline math mode if (!backslashEscape) { var textSection = text.Substring(startAt, endAt - startAt); switch (endingChar) { case '$': throw new InvalidCodePathException("The $ case should have been accounted for."); case '\\': backslashEscape = true; continue; case var sp when wordKind == WordKind.Whitespace || wordKind == WordKind.NewLine: //Collpase spaces 
//Consume newlines after commands if (displayMath == null) if (afterCommand) continue; else atoms.Add(); else mathLaTeX.Append(textSection); break; case var punc when displayMath == null && wordKind == WordKind.Punc && atoms.Last is TextAtom.Text t: //Append punctuation to text t.Append(textSection); break; default: //Just ordinary text if (displayMath == null) atoms.Add(textSection); else mathLaTeX.Append(textSection); break; } afterCommand = false; continue; } //Escaped text section but in inline/display math mode if (displayMath != null) { switch (endingChar) { case '$': throw new InvalidCodePathException("The $ case should have been accounted for."); case '(': switch (displayMath) { case true: return "Cannot open inline math mode in display math mode"; case false: return "Cannot open inline math mode in inline math mode"; default: throw new InvalidCodePathException("displayMath is null. This switch should not be hit."); } case ')': switch (displayMath) { case true: return "Cannot close inline math mode in display math mode"; case false: if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError) return "[Math mode error] " + mathError; mathLaTeX = null; displayMath = null; break; default: throw new InvalidCodePathException("displayMath is null. This switch should not be hit."); } break; case '[': switch (displayMath) { case true: return "Cannot open display math mode in display math mode"; case false: return "Cannot open display math mode in inline math mode"; default: throw new InvalidCodePathException("displayMath is null. This switch should not be hit."); } case ']': switch (displayMath) { case true: if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError) return "[Math mode error] " + mathError; mathLaTeX = null; displayMath = null; break; case false: return "Cannot close display math mode in inline math mode"; default: throw new InvalidCodePathException("displayMath is null. 
This switch should not be hit."); } break; default: mathLaTeX.Append($@"\{text.Substring(startAt, endAt - startAt)}"); break; } backslashEscape = false; continue; } //Escaped text section and not in inline/display math mode afterCommand = true; switch (text.Substring(startAt, endAt - startAt)) { case "(": mathLaTeX = new StringBuilder(); displayMath = false; break; case ")": return "Cannot close inline math mode outside of math mode"; case "[": mathLaTeX = new StringBuilder(); displayMath = true; break; case "]": return "Cannot close display math mode outside of math mode"; case @"\": atoms.Break(1); break; case ",": atoms.Add(Space.ShortSpace, 1); break; case var _ when wordKind == WordKind.Whitespace: //control space atoms.Add(); break; case "backslash": atoms.Add(@"\"); break; case "par": atoms.Break(3); #warning Should the newline and space occupy the same range? atoms.TextLength -= 3; atoms.Add(Space.ParagraphIndent, 3); break; case "fontsize": { if (ReadArgument().Bind(fontSize => float.TryParse(fontSize, System.Globalization.NumberStyles.AllowDecimalPoint | System.Globalization.NumberStyles.AllowLeadingWhite | System.Globalization.NumberStyles.AllowTrailingWhite, System.Globalization.CultureInfo.InvariantCulture, out var parsedResult) ? Ok(parsedResult) : Err("Invalid font size") ).Bind( ReadArgument().Bind(resizedContent => Build(resizedContent, enhancedColors)), (fontSize, resizedContent) => atoms.Add(resizedContent, fontSize, "fontsize".Length) ).Error is string error ) return error; break; } case "color": { if (ReadArgument().Bind(color => Color.Create(color, enhancedColors) is Color value ? Ok(value) : Err("Invalid color") ).Bind( ReadArgument().Bind(coloredContent => Build(coloredContent, enhancedColors)), (color, coloredContent) => atoms.Add(coloredContent, color, "color".Length) ).Error is string error ) return error; break; } //case "red", "yellow", ... 
case var shortColor when enhancedColors && Color.PredefinedColors.Contains(shortColor): { if (Ok(Color.Create(shortColor, enhancedColors) ?? throw new InvalidCodePathException( "This case's condition should have checked the validity of shortColor.") ).Bind( ReadArgument().Bind(coloredContent => Build(coloredContent, enhancedColors)), (color, coloredContent) => atoms.Add(coloredContent, color, shortColor.Length) ).Error is string error ) return error; break; } //case "textbf", "textit", ... case var command when !command.Contains("math") && FontStyleExtensions.FontStyles.TryGetByFirst(command.Replace("text", "math"), out var fontStyle): { if (ReadArgument() .Bind(content => Build(content, enhancedColors)) .Bind(builtContent => atoms.Add(builtContent, fontStyle, command.Length)) .Error is string error) return error; break; } case var command: if (displayMath != null) mathLaTeX.Append(command); //don't eat the command when parsing math else return @"Unknown command \" + command; break; } backslashEscape = false; } } { if (CheckDollarCount().Error is string error) return error; } if (backslashEscape) return @"Unknown command \"; if (displayMath != null) return "Math mode was not terminated"; return atoms.Build(); }
/// <summary>
/// Runs the word breaker over a slice of <paramref name="inputBuffer"/>, with
/// <paramref name="breakAtList"/> installed as the current output list.
/// </summary>
/// <param name="inputBuffer">Characters to analyse.</param>
/// <param name="startIndex">Index of the first character of the slice.</param>
/// <param name="len">Number of characters in the slice.</param>
/// <param name="breakAtList">
/// List stored in <c>_breakAtList</c> before breaking starts.
/// NOTE(review): presumably a break handler registered elsewhere appends to
/// <c>_breakAtList</c> while <c>BreakWords</c> runs; confirm against the constructor.
/// </param>
public void DoBreak(char[] inputBuffer, int startIndex, int len, List<int> breakAtList)
{
    // The target list must be in place before the breaker starts.
    _breakAtList = breakAtList;

    _textBreaker.BreakWords(inputBuffer, startIndex, len);
}
/// <summary>
/// Breaks a slice of <paramref name="inputBuffer"/> into words and then hands
/// <paramref name="breakAtList"/> to the breaker's <c>LoadBreakAtList</c>.
/// </summary>
/// <param name="inputBuffer">Characters to analyse.</param>
/// <param name="startIndex">Index of the first character of the slice.</param>
/// <param name="len">Number of characters in the slice.</param>
/// <param name="breakAtList">
/// List passed to <c>LoadBreakAtList</c> once breaking has finished.
/// NOTE(review): presumably it receives the break positions; whether existing
/// entries are kept or replaced is decided by <c>LoadBreakAtList</c> — confirm.
/// </param>
public void DoBreak(char[] inputBuffer, int startIndex, int len, List<int> breakAtList)
{
    // Step 1: analyse the requested slice.
    myTextBreaker.BreakWords(inputBuffer, startIndex, len);

    // Step 2: transfer the outcome into the caller's list.
    myTextBreaker.LoadBreakAtList(breakAtList);
}
/// <summary>
/// Parses LaTeX-style text into text atoms. A null or empty <paramref name="latexSource"/>
/// returns an empty <c>TextAtom.List</c> immediately.
/// </summary>
/// <remarks>
/// The input is split by <c>breaker</c> (declared outside this method); its break handler records
/// every break position together with its word kind in <c>breakList</c>, and the nested
/// <c>BuildBreakList</c> then walks those sections one by one.
/// State shared by the local functions: <c>displayMath</c> (null = text mode, false = inline math,
/// true = display math), <c>mathLaTeX</c> (math source collected so far), <c>backslashEscape</c>
/// (previous section was an unescaped backslash), <c>afterCommand</c>, <c>afterNewline</c> and
/// <c>dollarCount</c> (run of unescaped $ not yet interpreted by <c>CheckDollarCount</c>:
/// one $ toggles inline math, two toggle display math, more is an error).
/// In text mode the unescaped characters # ^ _ &amp; are reported as errors, ~ adds a control space,
/// % collects a comment up to the next newline, { starts a nested group that ends at the matching },
/// a lone } is an error, two consecutive newlines make a paragraph break, and a section equal to
/// <c>stopChar</c> ends the current <c>BuildBreakList</c> call.
/// Errors are returned as message strings; <c>InvalidCodePathException</c> marks states the code
/// treats as impossible.
/// NOTE(review): <c>breaker</c> looks like shared state and a new handler is installed on every
/// call — confirm this is safe for concurrent callers.
/// NOTE(review): the text of this method stops inside the escaped-math-mode switch; the remainder
/// of the body is not present in this listing.
/// </remarks>
/// <param name="latexSource">The LaTeX-style source text to parse.</param>
public static Result <TextAtom> TextAtomFromLaTeX(string latexSource) { if (string.IsNullOrEmpty(latexSource)) { return(new TextAtom.List(Array.Empty <TextAtom>(), 0)); } bool? displayMath = null; StringBuilder mathLaTeX = null; bool backslashEscape = false; bool afterCommand = false; //ignore spaces after command bool afterNewline = false; int dollarCount = 0; var globalAtoms = new TextAtomListBuilder(); var breakList = new List <BreakAtInfo>(); breaker.SetNewBreakHandler(v => breakList.Add(new BreakAtInfo(v.LatestBreakAt, v.LatestWordKind))); breaker.BreakWords(latexSource); Result CheckDollarCount(TextAtomListBuilder atoms) { switch (dollarCount) { case 0: break; case 1: dollarCount = 0; switch (displayMath) { case true: return("Cannot close display math mode with $"); case false: if (atoms.Math(mathLaTeX.ToString(), false).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; case null: mathLaTeX = new StringBuilder(); displayMath = false; break; } break; case 2: dollarCount = 0; switch (displayMath) { case true: if (atoms.Math(mathLaTeX.ToString(), true).Error is string mathError) { return("[Math mode error] " + mathError); } mathLaTeX = null; displayMath = null; break; case false: return("Cannot close inline math mode with $$"); case null: mathLaTeX = new StringBuilder(); displayMath = true; break; } break; default: return("Invalid number of $: " + dollarCount); } return(Ok()); } Result <int> BuildBreakList(ReadOnlySpan <char> latex, TextAtomListBuilder atoms, int i, bool oneCharOnly, char stopChar) { void ParagraphBreak() { atoms.Break(3); #warning Should the newline and space occupy the same range? atoms.TextLength -= 3; atoms.Space(Space.ParagraphIndent, 3); } for (; i < breakList.Count; i++) { void ObtainSection(ReadOnlySpan <char> latexInput, int index, out int start, out int end, out ReadOnlySpan <char> section, out WordKind kind) { (start, end) = (index == 0 ? 
0 : breakList[index - 1].breakAt, breakList[index].breakAt); section = latexInput.Slice(start, end - start); kind = breakList[index].wordKind; } ObtainSection(latex, i, out var startAt, out var endAt, out var textSection, out var wordKind); bool PreviousSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = i-- > 0; if (success) { ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } bool NextSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { bool success = ++i < breakList.Count; if (success) { ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind); } return(success); } Result <TextAtom.List> ReadArgumentAtom(ReadOnlySpan <char> latexInput) { backslashEscape = false; var argAtoms = new TextAtomListBuilder(); return(BuildBreakList(latexInput, argAtoms, ++i, true, '\0') .Bind(index => { i = index; return argAtoms.Build(); })); } SpanResult <char> ReadArgumentString(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section) { afterCommand = false; if (!NextSection(latexInput, ref section)) { return(Err("Missing argument")); } if (section.IsNot('{')) { return(Err("Missing {")); } int endingIndex = -1; //startAt + 1 to not start at the { we started at bool isEscape = false; for (int j = startAt + 1, bracketDepth = 0; j < latexInput.Length; j++) { if (latexInput[j] == '\\') { isEscape = true; } else if (latexInput[j] == '{' && !isEscape) { bracketDepth++; } else if (latexInput[j] == '}' && !isEscape) { if (bracketDepth > 0) { bracketDepth--; } else { endingIndex = j; break; } } else { isEscape = false; } } if (endingIndex == -1) { return(Err("Missing }")); } var resultText = latexInput.Slice(endAt, endingIndex - endAt); while (startAt < endingIndex) { _ = NextSection(latexInput, ref section); //this never fails because the above check } return(Ok(resultText)); } ReadOnlySpan <char> NextSectionUntilPunc(ReadOnlySpan <char> latexInput, 
ref ReadOnlySpan <char> section) { int start = endAt; ReadOnlySpan <char> specialChars = stackalloc[] { '#', '$', '%', '&', '\\', '^', '_', '{', '}', '~' }; while (NextSection(latexInput, ref section)) { if (wordKind != WordKind.Punc || specialChars.IndexOf(section[0]) != -1) { //We have overlooked by one PreviousSection(latexInput, ref section); break; } } return(latexInput.Slice(start, endAt - start)); } //Nothing should be before dollar sign checking -- dollar sign checking uses continue; atoms.TextLength = startAt; if (textSection.Is('$')) { if (backslashEscape) { if (displayMath != null) { mathLaTeX.Append(@"\$"); } else { atoms.Text("$", NextSectionUntilPunc(latex, ref textSection)); } } else { dollarCount++; continue; } backslashEscape = false; } else { { if (CheckDollarCount(atoms).Error is string error) { return(error); } } if (!backslashEscape) { //Unescaped text section, inside display/inline math mode if (displayMath != null) { switch (textSection) { case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('\\'): backslashEscape = true; continue; default: mathLaTeX.Append(textSection); break; } } //Unescaped text section, not inside display/inline math mode else { switch (textSection) { case var _ when stopChar > 0 && textSection[0] == stopChar: return(Ok(i)); case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('\\'): backslashEscape = true; continue; case var _ when textSection.Is('#'): return("Unexpected command argument reference character # outside of new command definition (currently unsupported)"); case var _ when textSection.Is('^'): case var _ when textSection.Is('_'): return($"Unexpected script indicator {textSection[0]} outside of math mode"); case var _ when textSection.Is('&'): return($"Unexpected alignment tab character & outside of table 
environments"); case var _ when textSection.Is('~'): atoms.ControlSpace(); break; case var _ when textSection.Is('%'): var comment = new StringBuilder(); while (NextSection(latex, ref textSection) && wordKind != WordKind.NewLine) { comment.Append(textSection); } atoms.Comment(comment.ToString()); break; case var _ when textSection.Is('{'): if (BuildBreakList(latex, atoms, ++i, false, '}').Bind(index => i = index).Error is string error) { return(error); } break; case var _ when textSection.Is('}'): return("Unexpected }, unbalanced braces"); case var _ when wordKind == WordKind.NewLine: // Consume newlines after commands // Double newline == paragraph break if (afterNewline) { ParagraphBreak(); afterNewline = false; break; } else { atoms.ControlSpace(); afterNewline = true; continue; } case var _ when wordKind == WordKind.Whitespace: //Collpase spaces if (afterCommand) { continue; } else { atoms.ControlSpace(); } break; default: //Just ordinary text if (oneCharOnly) { if (startAt + 1 < endAt) //Only re-read if current break span is more than 1 long { i--; breakList[i] = new BreakAtInfo(breakList[i].breakAt + 1, breakList[i].wordKind); } //Need to allocate in the end :( //Don't look ahead for punc; we are looking for one char only atoms.Text(textSection[0].ToString(), default); } else { atoms.Text(textSection.ToString(), NextSectionUntilPunc(latex, ref textSection)); } break; } } afterCommand = false; } //Escaped text section but in inline/display math mode else if (displayMath != null) { switch (textSection) { case var _ when textSection.Is('$'): throw new InvalidCodePathException("The $ case should have been accounted for."); case var _ when textSection.Is('('): return(displayMath switch { true => "Cannot open inline math mode in display math mode", false => "Cannot open inline math mode in inline math mode", null => throw new InvalidCodePathException("displayMath is null. This switch should not be hit."), });
/// <summary>
/// Parses LaTeX-style text into a <see cref="TextAtom"/> tree.
/// </summary>
/// <remarks>
/// The input is split into sections by a <c>CustomBreaker</c> and the sections are walked in order.
/// Supported constructs, as implemented below:
/// <list type="bullet">
/// <item>Math: <c>$…$</c> and <c>\(…\)</c> (inline), <c>$$…$$</c> and <c>\[…\]</c> (display).</item>
/// <item>Groups: <c>{…}</c>; a lone <c>}</c> or a missing <c>}</c> is an error.</item>
/// <item>Special characters in text mode: <c>~</c> adds a space atom, <c>%</c> starts a comment that
/// runs to the next newline, and <c>#</c>, <c>^</c>, <c>_</c>, <c>&amp;</c> are errors.</item>
/// <item>Whitespace collapses, whitespace directly after a command is dropped, and two consecutive
/// newlines form a paragraph break.</item>
/// <item>Commands: <c>\\</c>, <c>\,</c>, control space, <c>\par</c>, <c>\fontsize{size}{content}</c>,
/// <c>\color{color}{content}</c>, predefined short colour names (unless <c>NoEnhancedColors</c>),
/// text font styles, predefined accents and predefined text symbols.</item>
/// </list>
/// </remarks>
/// <param name="latex">Source text. <c>null</c> or empty yields an empty <c>TextAtom.List</c>.</param>
/// <returns>The built atom, or an error message describing the first problem encountered.</returns>
/// <exception cref="InvalidCodePathException">A state the parser treats as impossible was reached.</exception>
public static Result<TextAtom> Build(string latex)
{
    if (string.IsNullOrEmpty(latex))
    {
        return new TextAtom.List(Array.Empty<TextAtom>(), 0);
    }

    bool? displayMath = null;       // null: text mode, false: inline math, true: display math
    StringBuilder mathLaTeX = null; // math source collected while displayMath != null
    bool backslashEscape = false;   // previous section was an unescaped backslash
    bool afterCommand = false;      //ignore spaces after command
    bool afterNewline = false;      // previous section was a single newline
    int dollarCount = 0;            // run of unescaped $ that has not been interpreted yet
    var globalAtoms = new TextAtomListBuilder();
    var breaker = new CustomBreaker { BreakNumberAfterText = true, ThrowIfCharOutOfRange = false };
    var breakList = new List<BreakAtInfo>();
    breaker.BreakWords(latex);
    breaker.CopyBreakResults(breakList);

    // Interprets the pending run of $ signs: one toggles inline math, two toggle display math.
    // Closing a math section adds the collected source to the given builder.
    Result CheckDollarCount(TextAtomListBuilder atoms)
    {
        switch (dollarCount)
        {
            case 0:
                break;
            case 1:
                dollarCount = 0;
                switch (displayMath)
                {
                    case true:
                        return "Cannot close display math mode with $";
                    case false:
                        if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError)
                        {
                            return "[Math mode error] " + mathError;
                        }
                        mathLaTeX = null;
                        displayMath = null;
                        break;
                    case null:
                        mathLaTeX = new StringBuilder();
                        displayMath = false;
                        break;
                }
                break;
            case 2:
                dollarCount = 0;
                switch (displayMath)
                {
                    case true:
                        if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError)
                        {
                            return "[Math mode error] " + mathError;
                        }
                        mathLaTeX = null;
                        displayMath = null;
                        break;
                    case false:
                        return "Cannot close inline math mode with $$";
                    case null:
                        mathLaTeX = new StringBuilder();
                        displayMath = true;
                        break;
                }
                break;
            default:
                return "Invalid number of $: " + dollarCount;
        }
        return Ok();
    }

    // Consumes sections starting at index i and adds the resulting atoms to the given builder.
    // oneCharOnly: return after the first completed item (used for command arguments).
    // stopChar: when non-zero, an unescaped section starting with it in text mode ends the call.
    // Returns the index of the last section consumed, or an error message.
    Result<int> BuildBreakList(TextAtomListBuilder atoms, int i, bool oneCharOnly, char stopChar)
    {
        void ParagraphBreak()
        {
            atoms.Break(3);
#warning Should the newline and space occupy the same range?
            atoms.TextLength -= 3;
            atoms.Add(Space.ParagraphIndent, 3);
        }

        for (; i < breakList.Count; i++)
        {
            // A section spans from the previous break position (or 0) to this break position.
            (int startAt, int endAt, string textSection, WordKind wordKind) ObtainRange(int index)
            {
                var (start, end) = (index == 0 ? 0 : breakList[index - 1].breakAt, breakList[index].breakAt);
                return (start, end, latex.Substring(start, end - start), breakList[index].wordKind);
            }

            var (startAt, endAt, textSection, wordKind) = ObtainRange(i);

            // Advances to the following section; false when the input is exhausted.
            bool SetNextRange()
            {
                bool success = ++i < breakList.Count;
                if (success)
                {
                    (startAt, endAt, textSection, wordKind) = ObtainRange(i);
                }
                return success;
            }

            // Reads one argument (a single item or a braced group) as atoms.
            Result<TextAtom> ReadArgumentAtom()
            {
                backslashEscape = false;
                var argAtoms = new TextAtomListBuilder();
                if (BuildBreakList(argAtoms, ++i, true, '\0').Bind(index => i = index).Error is string error)
                {
                    return error;
                }
                return argAtoms.Build();
            }

            // Reads a braced argument as raw text, honouring nested and escaped braces.
            Result<string> ReadArgumentString()
            {
                afterCommand = false;
                if (!SetNextRange())
                {
                    return Err("Missing argument");
                }
                if (textSection != "{")
                {
                    return Err("Missing {");
                }
                int endingIndex = -1;
                bool isEscape = false;
                //startAt + 1 to not start at the { we started at
                for (int j = startAt + 1, bracketDepth = 0; j < latex.Length; j++)
                {
                    if (latex[j] == '\\')
                    {
                        isEscape = true;
                    }
                    else if (latex[j] == '{' && !isEscape)
                    {
                        bracketDepth++;
                    }
                    else if (latex[j] == '}' && !isEscape)
                    {
                        if (bracketDepth > 0)
                        {
                            bracketDepth--;
                        }
                        else
                        {
                            endingIndex = j;
                            break;
                        }
                    }
                    else
                    {
                        isEscape = false;
                    }
                }
                if (endingIndex == -1)
                {
                    return Err("Missing }");
                }
                var resultText = latex.Substring(endAt, endingIndex - endAt);
                while (startAt < endingIndex)
                {
                    _ = SetNextRange(); //this never fails because the above check
                }
                return Ok(resultText);
            }

            atoms.TextLength = startAt;
            if (textSection == "$")
            {
                if (backslashEscape)
                {
                    if (displayMath != null)
                    {
                        mathLaTeX.Append(@"\$");
                    }
                    else
                    {
                        atoms.Add("$");
                    }
                }
                else
                {
                    // Only count here; the run is interpreted once a non-$ section follows.
                    dollarCount++;
                    continue;
                }
                backslashEscape = false;
            }
            else
            {
                // Extra braces keep 'error' out of the enclosing scope.
                {
                    if (CheckDollarCount(atoms).Error is string error)
                    {
                        return error;
                    }
                }
                if (!backslashEscape)
                {
                    //Unescaped text section, inside display/inline math mode
                    if (displayMath != null)
                    {
                        switch (textSection)
                        {
                            case "$":
                                throw new InvalidCodePathException("The $ case should have been accounted for.");
                            case "\\":
                                backslashEscape = true;
                                continue;
                            default:
                                mathLaTeX.Append(textSection);
                                break;
                        }
                    }
                    //Unescaped text section, not inside display/inline math mode
                    else
                    {
                        switch (textSection)
                        {
                            // The stop character ends the group only here: unescaped, outside math mode,
                            // and after the pending $ run has been resolved. Testing it earlier would end
                            // the group at an escaped \} or at a brace inside math such as {$x^{2}$}, and
                            // would leave math closed by the preceding $ to be added to the wrong builder.
                            case var _ when stopChar > 0 && textSection[0] == stopChar:
                                return Ok(i);
                            case "$":
                                throw new InvalidCodePathException("The $ case should have been accounted for.");
                            case "\\":
                                backslashEscape = true;
                                continue;
                            case "#":
                                return "Unexpected command argument reference character # outside of new command definition (currently unsupported)";
                            case "^":
                            case "_":
                                return $"Unexpected script indicator {textSection} outside of math mode";
                            case "&":
                                return "Unexpected alignment tab character & outside of table environments";
                            case "~":
                                atoms.Add();
                                break;
                            case "%":
                                var comment = new StringBuilder();
                                while (SetNextRange() && wordKind != WordKind.NewLine)
                                {
                                    comment.Append(textSection);
                                }
                                atoms.Comment(comment.ToString());
                                break;
                            case "{":
                                if (BuildBreakList(atoms, ++i, false, '}').Bind(index => i = index).Error is string error)
                                {
                                    return error;
                                }
                                break;
                            case "}":
                                return "Unexpected }, unbalanced braces";
                            case var _ when wordKind == WordKind.NewLine:
                                //Consume newlines after commands
                                //Double newline == paragraph break
                                if (afterNewline)
                                {
                                    ParagraphBreak();
                                    afterNewline = false;
                                    break;
                                }
                                else
                                {
                                    atoms.Add();
                                    afterNewline = true;
                                    continue; // skips the afterNewline reset at the end of the loop body
                                }
                            case var _ when wordKind == WordKind.Whitespace:
                                //Collapse spaces
                                if (afterCommand)
                                {
                                    continue;
                                }
                                else
                                {
                                    atoms.Add();
                                }
                                break;
                            case var _ when wordKind == WordKind.Punc && atoms.Last is TextAtom.Text t:
                                //Append punctuation to text
                                t.Append(textSection);
                                break;
                            default:
                                //Just ordinary text
                                if (oneCharOnly)
                                {
                                    if (startAt + 1 < endAt) //Only re-read if current break span is more than 1 long
                                    {
                                        // Step back and move the previous break one character forward so
                                        // the rest of this section is read again after the consumed char.
                                        i--;
                                        breakList[i] = new BreakAtInfo(breakList[i].breakAt + 1, breakList[i].wordKind);
                                    }
                                    atoms.Add(textSection[0].ToString());
                                }
                                else
                                {
                                    atoms.Add(textSection);
                                }
                                break;
                        }
                    }
                    afterCommand = false;
                }
                //Escaped text section but in inline/display math mode
                else if (displayMath != null)
                {
                    switch (textSection)
                    {
                        case "$":
                            throw new InvalidCodePathException("The $ case should have been accounted for.");
                        case "(":
                            switch (displayMath)
                            {
                                case true:
                                    return "Cannot open inline math mode in display math mode";
                                case false:
                                    return "Cannot open inline math mode in inline math mode";
                                default:
                                    throw new InvalidCodePathException("displayMath is null. This switch should not be hit.");
                            }
                        case ")":
                            switch (displayMath)
                            {
                                case true:
                                    return "Cannot close inline math mode in display math mode";
                                case false:
                                    if (atoms.Add(mathLaTeX.ToString(), false).Error is string mathError)
                                    {
                                        return "[Math mode error] " + mathError;
                                    }
                                    mathLaTeX = null;
                                    displayMath = null;
                                    break;
                                default:
                                    throw new InvalidCodePathException("displayMath is null. This switch should not be hit.");
                            }
                            break;
                        case "[":
                            switch (displayMath)
                            {
                                case true:
                                    return "Cannot open display math mode in display math mode";
                                case false:
                                    return "Cannot open display math mode in inline math mode";
                                default:
                                    throw new InvalidCodePathException("displayMath is null. This switch should not be hit.");
                            }
                        case "]":
                            switch (displayMath)
                            {
                                case true:
                                    if (atoms.Add(mathLaTeX.ToString(), true).Error is string mathError)
                                    {
                                        return "[Math mode error] " + mathError;
                                    }
                                    mathLaTeX = null;
                                    displayMath = null;
                                    break;
                                case false:
                                    return "Cannot close display math mode in inline math mode";
                                default:
                                    throw new InvalidCodePathException("displayMath is null. This switch should not be hit.");
                            }
                            break;
                        default:
                            // Any other command is passed through to the math source unchanged.
                            mathLaTeX.Append($@"\{textSection}");
                            break;
                    }
                    backslashEscape = false;
                }
                else
                {
                    //Escaped text section and not in inline/display math mode
                    afterCommand = true;
                    switch (textSection)
                    {
                        case "(":
                            mathLaTeX = new StringBuilder();
                            displayMath = false;
                            break;
                        case ")":
                            return "Cannot close inline math mode outside of math mode";
                        case "[":
                            mathLaTeX = new StringBuilder();
                            displayMath = true;
                            break;
                        case "]":
                            return "Cannot close display math mode outside of math mode";
                        case @"\":
                            atoms.Break(1);
                            break;
                        case ",":
                            atoms.Add(Space.ShortSpace, 1);
                            break;
                        case var _ when wordKind == WordKind.Whitespace:
                            //control space
                            atoms.Add();
                            break;
                        case "par":
                            ParagraphBreak();
                            break;
                        case "fontsize":
                            {
                                if (ReadArgumentString().Bind(fontSize =>
                                        float.TryParse(fontSize,
                                            System.Globalization.NumberStyles.AllowDecimalPoint |
                                            System.Globalization.NumberStyles.AllowLeadingWhite |
                                            System.Globalization.NumberStyles.AllowTrailingWhite,
                                            System.Globalization.CultureInfo.InvariantCulture,
                                            out var parsedResult) ?
                                        Ok(parsedResult) :
                                        Err("Invalid font size")
                                    ).Bind(
                                        ReadArgumentAtom(),
                                        (fontSize, resizedContent) =>
                                            atoms.Add(resizedContent, fontSize, "fontsize".Length)
                                    ).Error is string error
                                )
                                {
                                    return error;
                                }
                                break;
                            }
                        case "color":
                            {
                                if (ReadArgumentString().Bind(color =>
                                        Color.Create(color, !NoEnhancedColors) is Color value ?
                                        Ok(value) :
                                        Err("Invalid color")
                                    ).Bind(
                                        ReadArgumentAtom(),
                                        (color, coloredContent) =>
                                            atoms.Add(coloredContent, color, "color".Length)
                                    ).Error is string error
                                )
                                {
                                    return error;
                                }
                                break;
                            }
                        //case "red", "yellow", ...
                        case var shortColor when !NoEnhancedColors && Color.PredefinedColors.Contains(shortColor):
                            {
                                if (Ok(Color.Create(shortColor, !NoEnhancedColors) ??
                                        throw new InvalidCodePathException(
                                            "This case's condition should have checked the validity of shortColor.")
                                    ).Bind(
                                        ReadArgumentAtom(),
                                        (color, coloredContent) =>
                                            atoms.Add(coloredContent, color, shortColor.Length)
                                    ).Error is string error
                                )
                                {
                                    return error;
                                }
                                break;
                            }
                        //case "textbf", "textit", ...
                        case var textStyle when !textStyle.Contains("math") &&
                            FontStyleExtensions.FontStyles.TryGetByFirst(textStyle.Replace("text", "math"), out var fontStyle):
                            {
                                if (ReadArgumentAtom()
                                    .Bind(builtContent => atoms.Add(builtContent, fontStyle, textStyle.Length))
                                    .Error is string error)
                                {
                                    return error;
                                }
                                break;
                            }
                        //case "^", "\"", ...
                        case var textAccent when TextAtoms.PredefinedAccents.TryGetByFirst(textAccent, out var accent):
                            {
                                if (ReadArgumentAtom()
                                    .Bind(builtContent => atoms.Add(builtContent, accent, textAccent.Length))
                                    .Error is string error)
                                {
                                    return error;
                                }
                                break;
                            }
                        //case "textasciicircum", "textless", ...
                        case var textSymbol when TextAtoms.PredefinedTextSymbols.TryGetValue(textSymbol, out var replaceResult):
                            atoms.Add(replaceResult);
                            break;
                        case var command:
                            // displayMath is always null in this branch (math-mode commands are handled
                            // in the branch above), so an unrecognised command is always an error.
                            return @"Unknown command \" + command;
                    }
                    backslashEscape = false;
                }
            }
            afterNewline = false;
            if (oneCharOnly)
            {
                return Ok(i);
            }
        }

        // Input exhausted.
        if (backslashEscape)
        {
            return @"Unknown command \";
        }
        if (stopChar > 0)
        {
            return stopChar == '}' ? "Expected }, unbalanced braces" : $@"Expected {stopChar}";
        }
        return Ok(i);
    }

    // Extra braces keep each 'error' variable in its own scope.
    {
        if (BuildBreakList(globalAtoms, 0, false, '\0').Error is string error)
        {
            return error;
        }
    }
    {
        // A trailing $ or $$ is still pending when the loop ends; resolve it now.
        if (CheckDollarCount(globalAtoms).Error is string error)
        {
            return error;
        }
    }
    if (displayMath != null)
    {
        return "Math mode was not terminated";
    }
    return globalAtoms.Build();
}