public void Parse_TextEmphasisAndBold_Successfully()
{
    // Arrange: an emphasised span, plain text, then a bold span.
    var markdown = "_the wrong_ barking up __tree__";
    var tokens = tokenizer.Tokenize(markdown);

    var emphasis = new TextNode(TextType.Emphasis);
    emphasis.Add(new WordNode(WordType.SimpleWord, "the"));
    emphasis.Add(new WordNode(WordType.SpacedWord, " wrong"));

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SpacedWord, " barking"));
    plain.Add(new WordNode(WordType.SpacedWord, " up"));
    plain.Add(new WordNode(WordType.Space, " "));

    var bold = new TextNode(TextType.Bold);
    bold.Add(new WordNode(WordType.SimpleWord, "tree"));

    var expected = new SentenceNode();
    expected.Add(emphasis);
    expected.Add(plain);
    expected.Add(bold);

    // Act & Assert: the sentence keeps the original span order.
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
public void Parse_TextAndEmphasis_Successfully()
{
    // Arrange: plain text followed by a trailing emphasised span.
    var markdown = "break the ice _go for broke_";
    var tokens = tokenizer.Tokenize(markdown);

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SimpleWord, "break"));
    plain.Add(new WordNode(WordType.SpacedWord, " the"));
    plain.Add(new WordNode(WordType.SpacedWord, " ice"));
    plain.Add(new WordNode(WordType.Space, " "));

    var emphasis = new TextNode(TextType.Emphasis);
    emphasis.Add(new WordNode(WordType.SimpleWord, "go"));
    emphasis.Add(new WordNode(WordType.SpacedWord, " for"));
    emphasis.Add(new WordNode(WordType.SpacedWord, " broke"));

    var expected = new SentenceNode();
    expected.Add(plain);
    expected.Add(emphasis);

    // Act & Assert
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
public void Parse_TwoWords_Successfully(string text, WordType firstType, WordType secondType, string firstExpectedValue, string secondExpectedValue)
{
    // Arrange: two consecutive words should produce exactly two word nodes.
    var tokenized = tokenizer.Tokenize(text);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(firstType, firstExpectedValue));
    expectedText.Add(new WordNode(secondType, secondExpectedValue));

    // Act & Assert
    Parser.ParseText(tokenized)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_IfAfterOpenUnderscoreFollowNonWhitespace_AsText()
{
    // Arrange: a space right after the opening underscore cancels the
    // emphasis, so the underscore is kept as a literal word.
    var markdown = "_ science";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(WordType.SimpleWord, "_"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " science"));

    // Act & Assert
    Parser.ParseEmphasisText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_WordAndSpace_Successfully(string word, WordType wordType)
{
    // Arrange: a word with a trailing space yields the word plus a Space node.
    var markdown = word + " ";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(wordType, word));
    expectedText.Add(new WordNode(WordType.Space, " "));

    // Act & Assert
    Parser.ParseText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_IfBeforeCloseUnderscoreNonWhitespace_AsText()
{
    // Arrange: whitespace before the closing underscore invalidates the
    // emphasis; everything is demoted to plain text.
    var markdown = "_norm _";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(WordType.SimpleWord, "_"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "norm"));
    expectedText.Add(new WordNode(WordType.Space, " "));

    // Act & Assert
    Parser.ParseEmphasisText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_NotPairedUnderscores_AsText()
{
    // Arrange: "_" opened but closed by "__" never pairs up, so all
    // underscores come through as literal words.
    var markdown = "_treatment__";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(WordType.SimpleWord, "_"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "treatment"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "__"));

    // Act & Assert
    Parser.ParseEmphasisText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public static TextNode ParseText(Deque<Token> tokens)
{
    // Consumes a run of Space/Text/Num tokens from the front of the deque
    // into a plain TextNode. Stops at the first token of any other type,
    // leaving it in the deque for the caller.
    var result = new TextNode();

    while (tokens.Count > 0)
    {
        var head = tokens.PeekFirst();

        // Escape sequences are not implemented yet.
        if (head.Type == TokenType.EscapeChar)
        {
            throw new NotSupportedException();
        }

        switch (head.Type)
        {
            case TokenType.Space:
                // A space token starts a spaced word (" word").
                result.Add(ParseSpacedWord(tokens));
                break;
            case TokenType.Text:
            case TokenType.Num:
                result.Add(ParseSimpleWord(tokens));
                break;
            default:
                // Underscores etc. terminate the plain-text run.
                return result;
        }
    }

    return result;
}
public void Parse_TextSurroundedByUnderscores_Successfully()
{
    // Arrange: a fully underscore-wrapped phrase parses as one Emphasis node.
    var markdown = "_what am i, chopped liver?_";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode(TextType.Emphasis);
    expectedText.Add(new WordNode(WordType.SimpleWord, "what"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " am"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " i,"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " chopped"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " liver?"));

    // Act & Assert
    Parser.ParseEmphasisText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_SimpleText_Successfully()
{
    // Arrange: markup-free text becomes a single plain TextNode.
    var markdown = "hard pill to swallow";
    var tokens = tokenizer.Tokenize(markdown);

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SimpleWord, "hard"));
    plain.Add(new WordNode(WordType.SpacedWord, " pill"));
    plain.Add(new WordNode(WordType.SpacedWord, " to"));
    plain.Add(new WordNode(WordType.SpacedWord, " swallow"));

    var expected = new SentenceNode();
    expected.Add(plain);

    // Act & Assert
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
// Parses a token stream whose first token is an underscore into an
// Emphasis TextNode. When the underscore turns out not to open a valid
// emphasis span (end of input, whitespace right after the opener, or
// whitespace right before the closer), the opener is demoted to a literal
// word and a plain-Text node is returned instead.
public static TextNode ParseEmphasisText(Deque <Token> tokens)
{
    var underscore = tokens.PopFirst();
    // A lone trailing "_" is plain text.
    if (tokens.Count == 0)
    {
        return(ParseText(new Deque <Token> { new Token(TokenType.Text, underscore.Value) }));
    }
    // "_ word": whitespace after the opener cancels the emphasis, so the
    // underscore is pushed back as an ordinary text token.
    if (tokens.PeekFirst().Type == TokenType.Space)
    {
        tokens.Insert(0, new Token(TokenType.Text, underscore.Value));
        return(ParseText(tokens));
    }
    var emphasisText = new TextNode(TextType.Emphasis);
    while (true)
    {
        // Input ended without a closing "_": re-emit the opener as a
        // literal word and downgrade the node to plain Text.
        if (tokens.Count == 0)
        {
            emphasisText.Words.Insert(0, new WordNode(WordType.SimpleWord, underscore.Value));
            return(new TextNode(TextType.Text, emphasisText.Words));
        }
        var currentToken = tokens.PeekFirst();
        switch (currentToken.Type)
        {
            case TokenType.Underscore:
                tokens.PopFirst();
                // "word _": whitespace before the closer invalidates it,
                // same demotion as the unterminated case above.
                if (emphasisText.Words.Last().Type == WordType.Space)
                {
                    emphasisText.Words.Insert(0, new WordNode(WordType.SimpleWord, underscore.Value));
                    return(new TextNode(TextType.Text, emphasisText.Words));
                }
                return(emphasisText);
            case TokenType.DoubleUnderscore:
                // "__" inside an emphasis span is kept as a literal word
                // (built from two copies of the opener's value).
                tokens.PopFirst();
                var doubleUnderscore = new WordNode(WordType.SimpleWord, underscore.Value + underscore.Value);
                emphasisText.Add(doubleUnderscore);
                break;
            default:
                // Absorb the next run of plain words/spaces into the body.
                var innerText = ParseText(tokens);
                var innerWords = innerText.Words;
                emphasisText.AddRange(innerWords);
                break;
        }
    }
}
public void Parse_SingleTextStartingWithSpace_Successfully()
{
    // Arrange: a leading space makes every word a spaced word.
    var markdown = " an arm and a leg";
    var tokens = tokenizer.Tokenize(markdown);

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SpacedWord, " an"));
    plain.Add(new WordNode(WordType.SpacedWord, " arm"));
    plain.Add(new WordNode(WordType.SpacedWord, " and"));
    plain.Add(new WordNode(WordType.SpacedWord, " a"));
    plain.Add(new WordNode(WordType.SpacedWord, " leg"));

    var expected = new SentenceNode();
    expected.Add(plain);

    // Act & Assert
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
public void Parse_SingleDoubleUnderscore_AsText()
{
    // Arrange: an unmatched "__" degrades to a literal word.
    var markdown = "__";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(WordType.SimpleWord, "__"));

    // Act & Assert
    Parser.ParseBoldText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_SingleWord_Successfully(string text, WordType type, string value)
{
    // Arrange: one word in, one WordNode out.
    var tokenized = tokenizer.Tokenize(text);

    var expectedText = new TextNode();
    expectedText.Add(new WordNode(type, value));

    // Act & Assert
    Parser.ParseText(tokenized)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_EmphasisTextInside_Successfully()
{
    // Arrange: an emphasis span nested inside bold is flattened into the
    // bold node's word list with literal <em>/</em> marker words.
    var markdown = "__a chip on _your_ shoulder__";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode(TextType.Bold);
    expectedText.Add(new WordNode(WordType.SimpleWord, "a"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " chip"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " on"));
    expectedText.Add(new WordNode(WordType.Space, " "));
    expectedText.Add(new WordNode(WordType.SimpleWord, "<em>"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "your"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "</em>"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " shoulder"));

    // Act & Assert
    Parser.ParseBoldText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_DoubleUnderscoresInside_AsText()
{
    // Arrange: "__" tokens inside an emphasis span stay literal words.
    var markdown = "_easy __as__ pie_";
    var tokens = tokenizer.Tokenize(markdown);

    var expectedText = new TextNode(TextType.Emphasis);
    expectedText.Add(new WordNode(WordType.SimpleWord, "easy"));
    expectedText.Add(new WordNode(WordType.Space, " "));
    expectedText.Add(new WordNode(WordType.SimpleWord, "__"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "as"));
    expectedText.Add(new WordNode(WordType.SimpleWord, "__"));
    expectedText.Add(new WordNode(WordType.SpacedWord, " pie"));

    // Act & Assert
    Parser.ParseEmphasisText(tokens)
        .Should()
        .BeEquivalentTo(expectedText);
}
public void Parse_SingleTextStartingAndEndingWithSpace_Successfully()
{
    // Arrange: leading space -> spaced words; trailing space -> Space node.
    var markdown = " everything but the kitchen sink ";
    var tokens = tokenizer.Tokenize(markdown);

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SpacedWord, " everything"));
    plain.Add(new WordNode(WordType.SpacedWord, " but"));
    plain.Add(new WordNode(WordType.SpacedWord, " the"));
    plain.Add(new WordNode(WordType.SpacedWord, " kitchen"));
    plain.Add(new WordNode(WordType.SpacedWord, " sink"));
    plain.Add(new WordNode(WordType.Space, " "));

    var expected = new SentenceNode();
    expected.Add(plain);

    // Act & Assert
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
public void Parse_SingleTextEndingWithSpace_Successfully()
{
    // Arrange: a trailing space becomes its own Space node.
    var markdown = "barking up the wrong tree ";
    var tokens = tokenizer.Tokenize(markdown);

    var plain = new TextNode();
    plain.Add(new WordNode(WordType.SimpleWord, "barking"));
    plain.Add(new WordNode(WordType.SpacedWord, " up"));
    plain.Add(new WordNode(WordType.SpacedWord, " the"));
    plain.Add(new WordNode(WordType.SpacedWord, " wrong"));
    plain.Add(new WordNode(WordType.SpacedWord, " tree"));
    plain.Add(new WordNode(WordType.Space, " "));

    var expected = new SentenceNode();
    expected.Add(plain);

    // Act & Assert
    Parser.ParseSentence(tokens)
        .Should()
        .BeEquivalentTo(expected);
}
// Serialises the word lookup table. Phase 1 groups words under "root"
// strings that contain them; phase 2 greedily merges roots whose texts
// overlap (longest overlap first); phase 3 writes the UTF-8 root texts to
// textWriter and, for each word, a (byte offset, byte length) pair to
// indexWriter so words can later be sliced back out of the text blob.
public static void WriteWordLookUp(EpsgData data, BinaryWriter textWriter, BinaryWriter indexWriter)
{
    var roots = new List<TextNode>();
    // Phase 1: attach each word to an existing containing root, or make it
    // a new root and absorb any existing roots it contains.
    foreach(var text in data.WordLookUpList)
    {
        var containerRoot = TextNode.FindContainingRoot(roots, text);
        if(null == containerRoot)
        {
            containerRoot = new TextNode(text);
            var containedRoots = roots.Where(r => containerRoot.Contains(r.Text)).ToList();
            foreach(var containedRoot in containedRoots)
            {
                roots.Remove(containedRoot);
                if(!containerRoot.Add(containedRoot))
                {
                    throw new InvalidOperationException();
                }
            }
            roots.Add(containerRoot);
        }else
        {
            if(!containerRoot.Add(text))
            {
                throw new InvalidOperationException();
            }
        }
    }
    // Phase 2: merge overlapping roots, preferring larger overlaps
    // ("quality" counts down from min(6, maxLen/2) to 0). A merge replaces
    // roots[i] and restarts scanning at the same i (note the i-- before
    // break); order of the mutations is significant.
    for (int quality = Math.Min(6,roots.Select(x => x.Text.Length).Max()/2); quality >= 0; quality--)
    {
        for (int i = 0; i < roots.Count; i++)
        {
            for (int j = i + 1; j < roots.Count; j++)
            {
                // Try roots[i]'s suffix overlapping roots[j]'s prefix...
                int overlapAt = StringUtils.OverlapIndex(roots[i].Text, roots[j].Text);
                if (overlapAt >= 0 && (roots[i].Text.Length - overlapAt) >= quality)
                {
                    var newText = roots[i].Text.Substring(0, overlapAt) + roots[j].Text;
                    var newNode = new TextNode(newText, new[]{roots[i], roots[j]});
                    roots.RemoveAt(j);
                    roots[i] = newNode;
                    i--;
                    break;
                }
                // ...then the reverse orientation.
                overlapAt = StringUtils.OverlapIndex(roots[j].Text, roots[i].Text);
                if (overlapAt >= 0 && (roots[j].Text.Length - overlapAt) >= quality)
                {
                    var newText = roots[j].Text.Substring(0, overlapAt) + roots[i].Text;
                    var newNode = new TextNode(newText, new[]{roots[j], roots[i]});
                    roots.RemoveAt(j);
                    roots[i] = newNode;
                    i--;
                    break;
                }
            }
        }
    }
    // Phase 3a: write root texts and record each contained string's byte
    // offset into the concatenated UTF-8 blob.
    var offsetLookUp = new Dictionary<string, int>();
    int rootOffset = 0;
    foreach(var root in roots)
    {
        var rootText = root.Text;
        var rootBytes = Encoding.UTF8.GetBytes(rootText);
        textWriter.Write(rootBytes);
        foreach(var text in root.GetAllString())
        {
            int startIndex = rootText.IndexOf(text, StringComparison.Ordinal);
            // Offset must be in bytes, not chars, so re-encode the prefix.
            var localOffset = Encoding.UTF8.GetByteCount(rootText.Substring(0, startIndex));
            offsetLookUp.Add(text, rootOffset + localOffset);
        }
        rootOffset += rootBytes.Length;
    }
    // Phase 3b: per-word index entries. NOTE(review): the casts silently
    // truncate — assumes total blob < 64 KiB and each word < 256 bytes;
    // confirm against the data set.
    foreach(var word in 
data.WordLookUpList)
    {
        indexWriter.Write((ushort)offsetLookUp[word]);
        indexWriter.Write((byte)(Encoding.UTF8.GetByteCount(word)));
    }
}
// Parses a token stream whose first token is a double underscore into a
// Bold TextNode. Nested "_..._" spans are flattened into the bold word
// list wrapped in literal "<em>"/"</em>" marker words. When the opener
// turns out not to start a valid bold span (end of input, whitespace right
// after the opener, or whitespace right before the closer), the opener is
// demoted to a literal word and a plain-Text node is returned instead.
public static TextNode ParseBoldText(Deque <Token> tokens)
{
    var doubleUnderscore = tokens.PopFirst();
    // A lone trailing "__" is plain text.
    if (tokens.Count == 0)
    {
        return(ParseText(new Deque <Token> { new Token(TokenType.Text, doubleUnderscore.Value) }));
    }
    // "__ word": whitespace after the opener cancels the bold span.
    if (tokens.PeekFirst().Type == TokenType.Space)
    {
        tokens.Insert(0, new Token(TokenType.Text, doubleUnderscore.Value));
        return(ParseText(tokens));
    }
    var boldText = new TextNode(TextType.Bold);
    while (true)
    {
        // Input ended without a closing "__": re-emit the opener as a
        // literal word and downgrade the node to plain Text.
        if (tokens.Count == 0)
        {
            boldText.Words.Insert(0, new WordNode(WordType.SimpleWord, doubleUnderscore.Value));
            return(new TextNode(TextType.Text, boldText.Words));
        }
        var currentToken = tokens.PeekFirst();
        switch (currentToken.Type)
        {
            case TokenType.DoubleUnderscore:
                tokens.PopFirst();
                // "word __": whitespace before the closer invalidates it.
                if (boldText.Words.Last().Type == WordType.Space)
                {
                    boldText.Words.Insert(0, new WordNode(WordType.SimpleWord, doubleUnderscore.Value));
                    return(new TextNode(TextType.Text, boldText.Words));
                }
                return(boldText);
            case TokenType.Underscore:
                // Delegate the nested span; if it demoted itself to plain
                // Text, splice its words in without <em> markers.
                var innerEmphasisText = ParseEmphasisText(tokens);
                if (innerEmphasisText.Type == TextType.Text)
                {
                    boldText.AddRange(innerEmphasisText.Words);
                    continue;
                }
                var openEmTag = new WordNode(WordType.SimpleWord, "<em>");
                var closeEmTag = new WordNode(WordType.SimpleWord, "</em>");
                boldText.Add(openEmTag);
                boldText.AddRange(innerEmphasisText.Words);
                boldText.Add(closeEmTag);
                break;
            default:
                // Absorb the next run of plain words/spaces into the body.
                var innerText = ParseText(tokens);
                var innerWords = innerText.Words;
                boldText.AddRange(innerWords);
                break;
        }
    }
}
// Serialises the word lookup table in three phases: (1) group words under
// "root" strings that contain them; (2) greedily merge roots whose texts
// overlap, longest overlap first; (3) write the UTF-8 root texts to
// textWriter and, for each word, a (byte offset, byte length) pair to
// indexWriter so words can later be sliced back out of the text blob.
public static void WriteWordLookUp(EpsgData data, BinaryWriter textWriter, BinaryWriter indexWriter)
{
    var roots = new List <TextNode>();
    // Phase 1: attach each word to an existing containing root, or make it
    // a new root and absorb any existing roots it contains.
    foreach (var text in data.WordLookUpList)
    {
        var containerRoot = TextNode.FindContainingRoot(roots, text);
        if (null == containerRoot)
        {
            containerRoot = new TextNode(text);
            var containedRoots = roots.Where(r => containerRoot.Contains(r.Text)).ToList();
            foreach (var containedRoot in containedRoots)
            {
                roots.Remove(containedRoot);
                if (!containerRoot.Add(containedRoot))
                {
                    throw new InvalidOperationException();
                }
            }
            roots.Add(containerRoot);
        }
        else
        {
            if (!containerRoot.Add(text))
            {
                throw new InvalidOperationException();
            }
        }
    }
    // Phase 2: merge overlapping roots, preferring larger overlaps
    // ("quality" counts down from min(6, maxLen/2) to 0). A merge replaces
    // roots[i] and restarts scanning at the same i (note the i-- before
    // break); the exact mutation order is significant.
    for (int quality = Math.Min(6, roots.Select(x => x.Text.Length).Max() / 2); quality >= 0; quality--)
    {
        for (int i = 0; i < roots.Count; i++)
        {
            for (int j = i + 1; j < roots.Count; j++)
            {
                // Try roots[i]'s suffix overlapping roots[j]'s prefix...
                int overlapAt = StringUtils.OverlapIndex(roots[i].Text, roots[j].Text);
                if (overlapAt >= 0 && (roots[i].Text.Length - overlapAt) >= quality)
                {
                    var newText = roots[i].Text.Substring(0, overlapAt) + roots[j].Text;
                    var newNode = new TextNode(newText, new[] { roots[i], roots[j] });
                    roots.RemoveAt(j);
                    roots[i] = newNode;
                    i--;
                    break;
                }
                // ...then the reverse orientation.
                overlapAt = StringUtils.OverlapIndex(roots[j].Text, roots[i].Text);
                if (overlapAt >= 0 && (roots[j].Text.Length - overlapAt) >= quality)
                {
                    var newText = roots[j].Text.Substring(0, overlapAt) + roots[i].Text;
                    var newNode = new TextNode(newText, new[] { roots[j], roots[i] });
                    roots.RemoveAt(j);
                    roots[i] = newNode;
                    i--;
                    break;
                }
            }
        }
    }
    // Phase 3a: write root texts and record each contained string's byte
    // offset into the concatenated UTF-8 blob.
    var offsetLookUp = new Dictionary <string, int>();
    int rootOffset = 0;
    foreach (var root in roots)
    {
        var rootText = root.Text;
        var rootBytes = Encoding.UTF8.GetBytes(rootText);
        textWriter.Write(rootBytes);
        foreach (var text in root.GetAllString())
        {
            int startIndex = rootText.IndexOf(text, StringComparison.Ordinal);
            // Offset must be in bytes, not chars, so re-encode the prefix.
            var localOffset = Encoding.UTF8.GetByteCount(rootText.Substring(0, startIndex));
            offsetLookUp.Add(text, rootOffset + localOffset);
        }
        rootOffset += rootBytes.Length;
    }
    // Phase 3b: per-word index entries. NOTE(review): the casts silently
    // truncate — assumes total blob < 64 KiB and each word < 256 bytes;
    // confirm against the data set.
    foreach (var word in data.WordLookUpList)
    {
        indexWriter.Write((ushort)offsetLookUp[word]);
        indexWriter.Write((byte)(Encoding.UTF8.GetByteCount(word)));
    }
}