/// <summary>
/// Attempts to read a bracketed label (<c>[...]</c>) starting at the current character of <paramref name="lines"/>.
/// </summary>
/// <typeparam name="T">The character iterator type.</typeparam>
/// <param name="lines">The iterator to read from; it must be positioned on the opening <c>[</c>.</param>
/// <param name="allowEmpty">When <c>false</c>, a label that contains no non-whitespace character is rejected.</param>
/// <param name="label">
/// On success, the label text: escaping backslashes are dropped, every run of whitespace is collapsed to a
/// single space and leading/trailing whitespace is removed. <c>null</c> on failure.
/// </param>
/// <param name="labelSpan">
/// On success, the span built from the iterator's <c>Start</c> values of the first and last retained characters,
/// or <see cref="SourceSpan.Empty"/> when that span would be inverted.
/// </param>
/// <returns><c>true</c> if a closing <c>]</c> was found and the label is shorter than 1000 characters.</returns>
public static bool TryParseLabel<T>(ref T lines, bool allowEmpty, out string label, out SourceSpan labelSpan) where T : ICharIterator
{
    label = null;
    labelSpan = SourceSpan.Empty;

    if (lines.CurrentChar != '[')
    {
        return false;
    }

    var text = StringBuilderCache.Local();
    int spanStart = -1;
    int spanEnd = -1;
    bool escaped = false;
    // Starts as true so that leading whitespace is never copied into the label.
    bool lastWasWhitespace = true;
    bool sawContent = false;
    bool success = false;

    for (;;)
    {
        char current = lines.NextChar();
        if (current == '\0')
        {
            break;
        }

        if (escaped)
        {
            // Only brackets and the backslash itself may follow an escaping backslash.
            bool escapable = current == '[' || current == ']' || current == '\\';
            if (!escapable)
            {
                break;
            }
        }
        else if (current == '[')
        {
            // An unescaped opening bracket is not allowed inside a label.
            break;
        }
        else if (current == ']')
        {
            // Move past the closing bracket.
            lines.NextChar();

            if (allowEmpty || sawContent)
            {
                // Drop trailing whitespace and shrink the span accordingly.
                while (text.Length > 0 && text[text.Length - 1].IsWhitespace())
                {
                    text.Length--;
                    spanEnd--;
                }

                // A label is only accepted when it is shorter than 1000 characters.
                if (text.Length < 1000)
                {
                    labelSpan.Start = spanStart;
                    labelSpan.End = spanEnd;
                    if (labelSpan.Start > labelSpan.End)
                    {
                        labelSpan = SourceSpan.Empty;
                    }

                    label = text.ToString();
                    success = true;
                }
            }

            break;
        }

        bool whitespace = current.IsWhitespace();
        if (whitespace)
        {
            // Any whitespace character is stored as a plain space.
            current = ' ';
        }

        if (!escaped && current == '\\')
        {
            // The backslash is not copied, but it may be the first character of the span.
            if (spanStart < 0)
            {
                spanStart = lines.Start;
            }

            escaped = true;
        }
        else
        {
            escaped = false;

            // Skip a whitespace character that directly follows another one.
            if (!(lastWasWhitespace && whitespace))
            {
                int position = lines.Start;
                if (spanStart < 0)
                {
                    spanStart = position;
                }

                spanEnd = position;
                text.Append(current);
                if (!whitespace)
                {
                    sawContent = true;
                }
            }
        }

        lastWasWhitespace = whitespace;
    }

    // Hand the shared builder back empty.
    text.Length = 0;
    return success;
}
/// <summary>
/// Attempts to parse a link destination starting at the current character of <paramref name="text"/>.
/// </summary>
/// <remarks>
/// Two forms are recognised:
/// a destination enclosed in <c>&lt;</c> and <c>&gt;</c> that contains no whitespace and no unescaped
/// <c>&lt;</c> or <c>&gt;</c>; or a bare destination that ends at the first space, tab, control character or
/// end of input and may contain a single level of balanced, unescaped parentheses.
/// A backslash followed by ASCII punctuation (including another backslash) yields that punctuation character;
/// a backslash followed by anything else is kept literally.
/// </remarks>
/// <typeparam name="T">The character iterator type.</typeparam>
/// <param name="text">The iterator to read from.</param>
/// <param name="link">The unescaped destination on success; otherwise <c>null</c>.</param>
/// <returns><c>true</c> if a destination was parsed.</returns>
public static bool TryParseUrl<T>(ref T text, out string link) where T : ICharIterator
{
    bool isValid = false;
    var buffer = StringBuilderCache.Local();
    buffer.Length = 0;

    var c = text.CurrentChar;

    if (c == '<')
    {
        // A sequence of zero or more characters between an opening < and a closing >
        // that contains no spaces, line breaks, or unescaped < or > characters.
        bool hasEscape = false;
        do
        {
            c = text.NextChar();
            if (!hasEscape && c == '>')
            {
                // Skip the closing '>'
                text.NextChar();
                isValid = true;
                break;
            }

            if (!hasEscape && c == '<')
            {
                break;
            }

            // The pending backslash did not escape anything: keep it literally.
            if (hasEscape && !c.IsAsciiPunctuation())
            {
                buffer.Append('\\');
            }

            // Only an unescaped backslash starts an escape. An escaped backslash ("\\")
            // falls through and is appended as a single literal backslash.
            if (c == '\\' && !hasEscape)
            {
                hasEscape = true;
                continue;
            }

            hasEscape = false;

            if (c.IsWhitespace()) // TODO: specs unclear. space is strict or relaxed? (includes tabs?)
            {
                break;
            }

            buffer.Append(c);
        } while (c != '\0');
    }
    else
    {
        // A sequence of characters that does not include ASCII space or control characters,
        // and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a
        // balanced pair of unescaped parentheses that is not itself inside a balanced pair of unescaped
        // parentheses.
        bool hasEscape = false;
        int openedParent = 0;
        while (true)
        {
            // Match opening and closing parenthesis
            if (c == '(')
            {
                if (!hasEscape)
                {
                    // Nested unescaped parentheses are not accepted.
                    if (openedParent > 0)
                    {
                        break;
                    }

                    openedParent++;
                }
            }

            if (c == ')')
            {
                if (!hasEscape)
                {
                    openedParent--;
                    if (openedParent < 0)
                    {
                        // This ')' belongs to the enclosing construct: the destination ends here.
                        isValid = true;
                        break;
                    }
                }
            }

            // The pending backslash did not escape anything: keep it literally.
            if (hasEscape && !c.IsAsciiPunctuation())
            {
                buffer.Append('\\');
            }

            // Only an unescaped backslash starts an escape. An escaped backslash ("\\")
            // falls through and is appended as a single literal backslash.
            if (c == '\\' && !hasEscape)
            {
                hasEscape = true;
                c = text.NextChar();
                continue;
            }

            hasEscape = false;

            if (c == '\0' || c.IsSpaceOrTab() || c.IsControl()) // TODO: specs unclear. space is strict or relaxed? (includes tabs?)
            {
                isValid = true;
                break;
            }

            buffer.Append(c);
            c = text.NextChar();
        }
    }

    link = isValid ? buffer.ToString() : null;
    // Hand the shared builder back empty.
    buffer.Length = 0;
    return isValid;
}
/// <summary>
/// Attempts to parse a link title starting at the current character of <paramref name="text"/>.
/// </summary>
/// <remarks>
/// A title is a sequence of zero or more characters enclosed in straight double quotes (<c>"</c>),
/// straight single quotes (<c>'</c>) or parentheses. The closing delimiter may only appear inside the title
/// when it is backslash-escaped. A backslash followed by ASCII punctuation (including another backslash)
/// yields that punctuation character; a backslash followed by anything else is kept literally.
/// Parsing fails when a line consisting only of spaces/tabs is encountered inside the title.
/// </remarks>
/// <typeparam name="T">The character iterator type.</typeparam>
/// <param name="text">The iterator to read from.</param>
/// <param name="title">The unescaped title on success; otherwise <c>null</c>.</param>
/// <returns><c>true</c> if a complete title, including its closing delimiter, was parsed.</returns>
public static bool TryParseTitle<T>(ref T text, out string title) where T : ICharIterator
{
    bool isValid = false;
    var buffer = StringBuilderCache.Local();
    buffer.Length = 0;

    var c = text.CurrentChar;
    if (c == '\'' || c == '"' || c == '(')
    {
        var closingQuote = c == '(' ? ')' : c;
        bool hasEscape = false;

        // State of the line currently being read:
        // -1: undefined (nothing seen since the last line break)
        //  1: only spaces/tabs seen so far
        //  0: at least one other character seen
        int hasOnlyWhiteSpacesSinceLastLine = -1;
        while (true)
        {
            c = text.NextChar();

            if (c == '\n')
            {
                if (hasOnlyWhiteSpacesSinceLastLine >= 0)
                {
                    // A line made only of spaces/tabs terminates the title as invalid.
                    if (hasOnlyWhiteSpacesSinceLastLine == 1)
                    {
                        break;
                    }

                    hasOnlyWhiteSpacesSinceLastLine = -1;
                }
            }

            if (c == '\0')
            {
                break;
            }

            if (c == closingQuote)
            {
                if (hasEscape)
                {
                    // Escaped delimiter: part of the title.
                    buffer.Append(closingQuote);
                    hasEscape = false;
                    continue;
                }

                // Skip last quote
                text.NextChar();
                isValid = true;
                break;
            }

            // The pending backslash did not escape anything: keep it literally.
            if (hasEscape && !c.IsAsciiPunctuation())
            {
                buffer.Append('\\');
            }

            // Only an unescaped backslash starts an escape. An escaped backslash ("\\")
            // falls through and is appended as a single literal backslash, so that a
            // following closing delimiter is not mistaken for an escaped one.
            if (c == '\\' && !hasEscape)
            {
                hasEscape = true;
                continue;
            }

            hasEscape = false;

            if (c.IsSpaceOrTab())
            {
                if (hasOnlyWhiteSpacesSinceLastLine < 0)
                {
                    hasOnlyWhiteSpacesSinceLastLine = 1;
                }
            }
            else if (c != '\n')
            {
                hasOnlyWhiteSpacesSinceLastLine = 0;
            }

            buffer.Append(c);
        }
    }

    title = isValid ? buffer.ToString() : null;
    // Hand the shared builder back empty.
    buffer.Length = 0;
    return isValid;
}
/// <summary>
/// Builds a URL-friendly identifier from a heading text.
/// </summary>
/// <remarks>
/// Everything before the first letter is discarded. After that, letters are lower-cased, digits are kept,
/// a run of whitespace becomes a single <c>-</c> (unless it follows reserved punctuation), and reserved
/// punctuation (as decided by <c>IsReservedPunctuation</c>) is kept without immediate repetition.
/// Trailing reserved punctuation is removed from the result.
/// </remarks>
/// <param name="headingText">The heading text to convert.</param>
/// <param name="allowOnlyAscii">
/// When the <c>SUPPORT_NORMALIZE</c> symbol is defined and this is <c>true</c>, the text is decomposed
/// (<c>FormD</c>) first and letters outside the printable ASCII range are skipped.
/// </param>
/// <returns>The generated identifier; it may be empty.</returns>
public static string Urilize(string headingText, bool allowOnlyAscii)
{
#if SUPPORT_NORMALIZE
    // Normalize the string if we only allow ASCII, so accents are split from their base letter
    if (allowOnlyAscii)
    {
        headingText = headingText.Normalize(NormalizationForm.FormD);
    }
#endif

    var slug = StringBuilderCache.Local();
    bool seenLetter = false;
    bool lastWasSpace = false;

    foreach (char source in headingText)
    {
        char c = source;

        if (char.IsLetter(c))
        {
#if SUPPORT_NORMALIZE
            if (allowOnlyAscii && (c < ' ' || c >= 127))
            {
                continue;
            }
#endif
            if (char.IsUpper(c))
            {
                c = char.ToLowerInvariant(c);
            }

            slug.Append(c);
            seenLetter = true;
            lastWasSpace = false;
            continue;
        }

        // Nothing but letters is accepted until the first letter has been written.
        if (!seenLetter)
        {
            continue;
        }

        if (IsReservedPunctuation(c))
        {
            // Punctuation replaces the character written just before the preceding whitespace run.
            if (lastWasSpace)
            {
                slug.Length--;
            }

            // Never write the same punctuation character twice in a row.
            if (slug[slug.Length - 1] != c)
            {
                slug.Append(c);
            }

            lastWasSpace = false;
        }
        else if (c.IsDigit())
        {
            slug.Append(c);
            lastWasSpace = false;
        }
        else if (!lastWasSpace && c.IsWhitespace())
        {
            // The first whitespace of a run becomes '-', unless punctuation already separates the words.
            char last = slug[slug.Length - 1];
            if (!IsReservedPunctuation(last))
            {
                slug.Append('-');
            }

            lastWasSpace = true;
        }
    }

    // Trim trailing reserved punctuation
    int end = slug.Length;
    while (end > 0 && IsReservedPunctuation(slug[end - 1]))
    {
        end--;
    }

    slug.Length = end;

    var text = slug.ToString();
    // Hand the shared builder back empty.
    slug.Length = 0;
    return text;
}
/// <summary>
/// Attempts to parse an autolink (<c>&lt;scheme:...&gt;</c> or <c>&lt;user@domain&gt;</c>) starting at the
/// current character of <paramref name="text"/>.
/// </summary>
/// <param name="text">The slice to read from; it must be positioned on the opening <c>&lt;</c>.</param>
/// <param name="link">On success, the text between the angle brackets; otherwise <c>null</c>.</param>
/// <param name="isEmail">
/// Set to <c>true</c> as soon as an <c>@</c> separator has been found, even if the domain part then turns
/// out to be invalid and the method returns <c>false</c>.
/// </param>
/// <returns><c>true</c> if a complete autolink, including the closing <c>&gt;</c>, was parsed.</returns>
public static bool TryParseAutolink(ref StringSlice text, out string link, out bool isEmail)
{
    link = null;
    isEmail = false;

    var c = text.CurrentChar;
    if (c != '<')
    {
        return false;
    }

    // An absolute URI, for these purposes, consists of a scheme followed by a colon (:)
    // followed by zero or more characters other than ASCII whitespace and control characters, <, and >.
    // If the URI includes these characters, they must be percent-encoded (e.g. %20 for a space).
    //
    // A scheme is any sequence of 2-32 characters
    // beginning with an ASCII letter
    // and followed by any combination of ASCII letters, digits, or the symbols plus ("+"), period ("."), or hyphen ("-").
    //
    // An email address, for these purposes, is anything that matches the non-normative regex from the HTML5 spec:
    // /^
    // [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+
    // @
    // [a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
    // (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/

    c = text.NextChar();

    // false: the prefix may still be a URI scheme or an email user name
    // true : the prefix can only be an email user name
    bool emailOnly = false;

    if (!c.IsAlpha())
    {
        // A scheme must start with a letter, so this can only be an email
        if (c.IsDigit() || CharHelper.IsEmailUsernameSpecialChar(c))
        {
            emailOnly = true;
        }
        else
        {
            return false;
        }
    }

    var builder = StringBuilderCache.Local();

    // ****************************
    // 1. Scan scheme or user email
    // ****************************
    builder.Append(c);
    while (true)
    {
        c = text.NextChar();

        // Chars valid for both scheme and email
        var isSpecialChar = c == '+' || c == '.' || c == '-';
        var isValidChar = c.IsAlphaNumeric() || isSpecialChar;

        if (CharHelper.IsEmailUsernameSpecialChar(c))
        {
            isValidChar = true;
            // If this is not a special char valid also for url scheme, then we have an email
            if (!isSpecialChar)
            {
                emailOnly = true;
            }
        }

        if (isValidChar)
        {
            builder.Append(c);
        }
        else if (c == ':')
        {
            // At this point the builder holds exactly the scheme:
            // it must be a valid scheme prefix of 2 to 32 characters.
            if (emailOnly || builder.Length < 2 || builder.Length > 32)
            {
                builder.Length = 0;
                return false;
            }

            break;
        }
        else if (c == '@')
        {
            isEmail = true;
            break;
        }
        else
        {
            builder.Length = 0;
            return false;
        }
    }

    // append ':' or '@'
    builder.Append(c);

    if (isEmail)
    {
        // ****************************
        // 2.a Scan the email domain
        // ****************************
        // [a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
        // (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
        int labelLength = 0;    // number of characters in the current dot-separated label
        char previous = '\0';   // last character accepted in the domain

        while (true)
        {
            c = text.NextChar();

            if (c == '>')
            {
                // The last label must be non-empty and must not end with '-'
                if (labelLength == 0 || previous == '-')
                {
                    break;
                }

                text.NextChar();
                link = builder.ToString();
                builder.Length = 0;
                return true;
            }

            if (c.IsAlphaNumeric() || (c == '-' && labelLength > 0))
            {
                // A label starts with an alphanumeric character and holds at most 63 characters
                labelLength++;
                if (labelLength > 63)
                {
                    break;
                }
            }
            else if (c == '.')
            {
                // A label must be non-empty (no leading or doubled '.') and must not end with '-'
                if (labelLength == 0 || previous == '-')
                {
                    break;
                }

                labelLength = 0;
            }
            else
            {
                break;
            }

            builder.Append(c);
            previous = c;
        }
    }
    else
    {
        // ****************************
        // 2.b Scan the rest of the URI
        // ****************************
        while (true)
        {
            c = text.NextChar();
            if (c == '\0')
            {
                break;
            }

            if (c == '>')
            {
                text.NextChar();
                link = builder.ToString();
                builder.Length = 0;
                return true;
            }

            // Only printable ASCII characters other than '<' are accepted
            if (c > ' ' && c < 127 && c != '<')
            {
                builder.Append(c);
            }
            else
            {
                break;
            }
        }
    }

    // Hand the shared builder back empty.
    builder.Length = 0;
    return false;
}
/// <summary>
/// Returns an unescaped copy of a string: backslashes placed before escapable punctuation or symbol
/// characters are removed, and character entities (named or numeric) are decoded.
/// </summary>
/// <param name="text">The text to unescape. The input string itself is not modified.</param>
/// <param name="removeBackSlash">
/// If set to <c>true</c>, backslash escapes are processed; otherwise only entities are decoded and
/// backslashes are left untouched.
/// </param>
/// <returns>
/// The unescaped text; <paramref name="text"/> itself when nothing had to be changed;
/// an empty string when <paramref name="text"/> is <c>null</c> or empty.
/// </returns>
public static string Unescape(string text, bool removeBackSlash = true)
{
    // Credits: code from CommonMark.NET
    // Copyright (c) 2014, Kārlis Gaņģis All rights reserved.
    // See license for details: https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    // searchPos: where the next scan for '\' or '&' starts
    // lastPos  : start of the part of text that has not been copied to the builder yet
    int searchPos = 0;
    int lastPos = 0;
    char c;
    char[] search = removeBackSlash ? SearchBackAndAmp : SearchAmp;
    StringBuilder sb = null;

    while ((searchPos = text.IndexOfAny(search, searchPos)) != -1)
    {
        sb ??= StringBuilderCache.Local();
        c = text[searchPos];
        if (removeBackSlash && c == '\\')
        {
            searchPos++;

            // A trailing backslash escapes nothing and is kept
            if (text.Length == searchPos)
            {
                break;
            }

            c = text[searchPos];
            if (c.IsEscapableSymbol())
            {
                // Copy everything up to (but excluding) the backslash
                sb.Append(text, lastPos, searchPos - lastPos - 1);
                lastPos = searchPos;

                // Step over the escaped character so it is taken literally: without this,
                // an escaped '\' would be treated as a new escape and an escaped '&' as
                // the start of an entity on the next iteration.
                searchPos++;
            }
        }
        else if (c == '&')
        {
            var match = ScanEntity(new StringSlice(text, searchPos, text.Length - 1), out int numericEntity, out int entityNameStart, out int entityNameLength);
            if (match == 0)
            {
                // Not an entity: keep the '&' and continue after it
                searchPos++;
            }
            else
            {
                searchPos += match;

                if (entityNameLength > 0)
                {
                    var decoded = EntityHelper.DecodeEntity(text.AsSpan(entityNameStart, entityNameLength));
                    // An unknown entity name is left as is
                    if (decoded != null)
                    {
                        sb.Append(text, lastPos, searchPos - match - lastPos);
                        sb.Append(decoded);
                        lastPos = searchPos;
                    }
                }
                else if (numericEntity >= 0)
                {
                    sb.Append(text, lastPos, searchPos - match - lastPos);
                    EntityHelper.DecodeEntity(numericEntity, sb);
                    lastPos = searchPos;
                }
            }
        }
    }

    // Nothing was replaced: the builder (if any) is still empty and the input can be returned unchanged
    if (sb == null || lastPos == 0)
    {
        return text;
    }

    sb.Append(text, lastPos, text.Length - lastPos);
    return sb.GetStringAndReset();
}