/// <summary>
/// Checks that <c>ConsumeLetterThenDigitSequence</c> consumes a run of letters
/// followed by digits, stops at the first other character, and leaves the
/// remainder of the input available to later reads.
/// </summary>
public void ConsumeLetterThenDigitSequence()
{
    const string input = "One12 Two &bar; qux";
    CharacterReader reader = new CharacterReader(input);

    // Letters followed by digits are consumed as a single token.
    string firstToken = reader.ConsumeLetterThenDigitSequence();
    Assert.AreEqual("One12", firstToken);

    // The separating space is not part of the token and is still pending.
    Assert.AreEqual(' ', reader.Consume());

    // A letters-only run is consumed as well (no digits required).
    string secondToken = reader.ConsumeLetterThenDigitSequence();
    Assert.AreEqual("Two", secondToken);

    // Everything after the second token remains untouched.
    string remainder = reader.ConsumeToEnd();
    Assert.AreEqual(" &bar; qux", remainder);
}
/// <summary>
/// Attempts to consume a character reference (numeric <c>#…</c> or named) starting
/// at the reader's current position. The reader is marked before parsing so that
/// an unsuccessful attempt can rewind and leave the input unconsumed.
/// </summary>
/// <param name="additionalAllowedCharacter">
/// Optional character that, when it is the reader's current character, means
/// no reference is consumed and <c>null</c> is returned.
/// </param>
/// <param name="inAttribute">
/// When <c>true</c>, a matched named reference that is immediately followed by a
/// letter, a digit, '=', '-' or '_' is rejected and the reader is rewound.
/// </param>
/// <returns>
/// <c>null</c> when no reference was consumed; <c>ReplacementChar</c> when a numeric
/// reference is unparsable or outside the valid range; otherwise the referenced character.
/// </returns>
public char? ConsumeCharacterReference(char? additionalAllowedCharacter, bool inAttribute)
{
    if (_reader.IsEmpty())
    {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == _reader.Current())
    {
        return null;
    }
    if (_reader.MatchesAny('\t', '\n', '\r', '\f', ' ', '<', '&'))
    {
        return null;
    }

    _reader.Mark();
    if (_reader.MatchConsume("#"))
    {
        // numbered
        bool isHexMode = _reader.MatchConsumeIgnoreCase("X");
        string numRef = isHexMode ? _reader.ConsumeHexSequence() : _reader.ConsumeDigitSequence();
        if (numRef.Length == 0)
        {
            // didn't match anything
            CharacterReferenceError("Numeric reference with no numerals");
            _reader.RewindToMark();
            return null;
        }
        if (!_reader.MatchConsume(";"))
        {
            CharacterReferenceError("Missing semicolon"); // missing semi
        }

        int charval = -1;
        try
        {
            int numbase = isHexMode ? 16 : 10;
            charval = Convert.ToInt32(numRef, numbase);
        }
        catch (FormatException)
        {
            // skip: charval keeps its -1 sentinel and is reported as out of range below
        }
        catch (OverflowException)
        {
            // Too many digits to fit in an int (e.g. "&#99999999999;"): treat it like any
            // other out-of-range reference instead of letting the exception escape.
        }

        // "charval < 0" rather than "== -1": base-16 conversion interprets 8-digit values with
        // the high bit set as negative numbers (e.g. "80000000"), which must also be rejected.
        if (charval < 0 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF)
        {
            CharacterReferenceError("Character outside of valid range");
            return ReplacementChar;
        }
        else
        {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            // NOTE(review): code points above 0xFFFF do not fit in a single char, so this cast
            // cannot yield the intended character for them; fixing that needs a wider return type.
            return (char)charval;
        }
    }
    else
    {
        // named
        // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found
        string nameRef = _reader.ConsumeLetterThenDigitSequence();
        bool looksLegit = _reader.Matches(';');

        // found if a base named entity without a ;, or an extended entity with the ;.
        bool found = (Entities.IsBaseNamedEntity(nameRef) || (Entities.IsNamedEntity(nameRef) && looksLegit));
        if (!found)
        {
            _reader.RewindToMark();
            if (looksLegit)
            {
                // Only report an error when the text was terminated by ';'.
                CharacterReferenceError(string.Format("Invalid named referenece '{0}'", nameRef));
            }
            return null;
        }
        if (inAttribute && (_reader.MatchesLetter() || _reader.MatchesDigit() || _reader.MatchesAny('=', '-', '_')))
        {
            // don't want that to match
            _reader.RewindToMark();
            return null;
        }
        if (!_reader.MatchConsume(";"))
        {
            CharacterReferenceError("Missing semicolon"); // missing semi
        }
        return Entities.GetCharacterByName(nameRef);
    }
}