/// <summary>
/// A unit whose CDATA source is an unformatted &lt;ul&gt; with three &lt;li&gt; items is
/// extracted into a "u1-g" group holding a "ul" sub-group with one "li" unit per item.
/// </summary>
public void UnorderedListWithMultipleItemsWithoutFormattingIsTransformedIntoGroupWithManyUnits()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <unit id=""u1"" name=""original""> <segment> <source><![CDATA[<ul><li>Hello Word1!</li><li>Hello Word2!</li><li>Hello Word3!</li></ul>]]></source> </segment> </unit> </file> </xliff>";
    XliffDocument document = LoadXliff(xliff);
    var splitter = new ParagraphSplitter();

    var newDocument = splitter.ExecuteExtraction(document);

    Assert.AreEqual(1, newDocument.Files[0].Containers.Count);
    var group = newDocument.Files[0].Containers[0] as Group;
    Assert.IsNotNull(group);
    Assert.AreEqual("u1-g", group.Id);
    Assert.AreEqual("original", group.Name);
    Assert.AreEqual(1, group.Containers.Count);

    var ulGroup = group.Containers[0] as Group;
    // Guard the cast before dereferencing: a wrong container type should fail the
    // assert, not throw a NullReferenceException.
    Assert.IsNotNull(ulGroup);
    Assert.AreEqual("ul", ulGroup.Name);
    Assert.AreEqual(3, ulGroup.Containers.Count);

    var unit1 = ulGroup.Containers[0] as Unit;
    Assert.IsNotNull(unit1);
    var textUnit1 = unit1.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("li", unit1.Name);
    Assert.AreEqual("Hello Word1!", textUnit1);

    var unit2 = ulGroup.Containers[1] as Unit;
    Assert.IsNotNull(unit2);
    var textUnit2 = unit2.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("li", unit2.Name);
    Assert.AreEqual("Hello Word2!", textUnit2);

    var unit3 = ulGroup.Containers[2] as Unit;
    Assert.IsNotNull(unit3);
    var textUnit3 = unit3.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("li", unit3.Name);
    Assert.AreEqual("Hello Word3!", textUnit3);
}
/// <summary>
/// Merging a group that contains a "p" unit followed by a "ul" sub-group of "li"
/// units produces a single unit whose target CDATA wraps the items back in a list.
/// </summary>
public void OrderedListAsChildWithTwoItemIsWrappedInUL()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" trgLang=""it-IT"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <group id=""u1-g"" name=""original""> <unit id=""u1-1"" name=""p"" > <segment> <source>whatever</source> <target>whatever</target> </segment> </unit> <group id=""u1-2-g"" name=""ul"" > <unit id= ""u1-2-1"" name=""li""> <segment> <source>Item 1</source> <target>Item 1</target> </segment> </unit> <unit id= ""u1-2-2"" name=""li"" > <segment> <source>Item 1</source> <target>Item 1</target> </segment> </unit> </group> </group> </file> </xliff>";

    // Load and merge the split document back into a single unit.
    XliffDocument sourceDocument = LoadXliff(xliff);
    var paragraphSplitter = new ParagraphSplitter();
    var mergedDocument = paragraphSplitter.ExecuteMerge(sourceDocument);

    Assert.AreEqual(1, mergedDocument.Files[0].Containers.Count);

    var mergedUnit = mergedDocument.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(mergedUnit);
    Assert.AreEqual("u1", mergedUnit.Id);
    Assert.AreEqual("original", mergedUnit.Name);
    Assert.AreEqual(1, mergedUnit.Resources[0].Target.Text.Count);

    // The merged target must be a single CDATA payload with the rebuilt HTML.
    var cdata = mergedUnit.Resources[0].Target.Text[0] as CDataTag;
    Assert.IsNotNull(cdata);
    Assert.AreEqual("<p>whatever</p><ul><li>Item 1</li><li>Item 1</li></ul>", cdata.Text);
}
/// <summary>
/// A unit with three &lt;p&gt; paragraphs in its CDATA source is extracted into a group
/// of three units typed "html:p"; the paragraph with inline &lt;b&gt; keeps its CDATA form.
/// </summary>
public void MultipleParagraphsUnitIsTransformedIntoGroupWithManyUnits()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <unit id=""u1""> <segment> <source><![CDATA[<p>Hello Word1!</p><p>Hello Word2!</p><p>Hello <b>Word3</b>!</p>]]></source> </segment> </unit> </file> </xliff>";
    XliffDocument document = LoadXliff(xliff);
    var splitter = new ParagraphSplitter(_htmlParser);

    var newDocument = splitter.ExecuteExtraction(document);

    Assert.AreEqual(1, newDocument.Files[0].Containers.Count);
    var group = newDocument.Files[0].Containers[0] as Group;
    Assert.IsNotNull(group);
    Assert.AreEqual("u1-g", group.Id);
    Assert.AreEqual(3, group.Containers.Count);

    var unit1 = group.Containers[0] as Unit;
    // Guard each cast before dereferencing so a wrong container type fails the
    // assert instead of throwing a NullReferenceException.
    Assert.IsNotNull(unit1);
    var textUnit1 = unit1.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("html:p", unit1.Type);
    Assert.AreEqual("Hello Word1!", textUnit1);

    var unit2 = group.Containers[1] as Unit;
    Assert.IsNotNull(unit2);
    var textUnit2 = unit2.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("html:p", unit2.Type);
    Assert.AreEqual("Hello Word2!", textUnit2);

    var unit3 = group.Containers[2] as Unit;
    Assert.IsNotNull(unit3);
    var textUnit3 = unit3.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("html:p", unit3.Type);
    // Inline formatting forces the segment to stay CDATA-wrapped.
    Assert.AreEqual("<![CDATA[Hello <b>Word3</b>!]]>", textUnit3);
}
/// <summary>
/// A group of three plain "p" units is merged back into one unit whose target is a
/// single CDATA payload with each text re-wrapped in &lt;p&gt; tags.
/// </summary>
public void OneGroupWithMultipleParagraphWithNotFormattingCDataAreMergedBackIntoOneUnit()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" trgLang=""it-IT"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <group id=""u1-g"" name=""original""> <unit id=""u1-1"" name=""p""> <segment> <source>Hello Word1!</source> <target>Hello Word1!</target> </segment> </unit> <unit id=""u1-2"" name=""p""> <segment> <source>Hello Word2!</source> <target>Hello Word2!</target> </segment> </unit> <unit id=""u1-3"" name=""p""> <segment> <source>Hello Word3!</source> <target>Hello Word3!</target> </segment> </unit> </group> </file> </xliff>";

    XliffDocument sourceDocument = LoadXliff(xliff);
    var paragraphSplitter = new ParagraphSplitter();

    var mergedDocument = paragraphSplitter.ExecuteMerge(sourceDocument);

    Assert.AreEqual(1, mergedDocument.Files[0].Containers.Count);
    var mergedUnit = mergedDocument.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(mergedUnit);
    Assert.AreEqual("u1", mergedUnit.Id);
    Assert.AreEqual("original", mergedUnit.Name);
    Assert.AreEqual(1, mergedUnit.Resources[0].Target.Text.Count);

    // All three paragraphs end up in one CDATA block of the merged target.
    var targetCData = mergedUnit.Resources[0].Target.Text[0] as CDataTag;
    Assert.IsNotNull(targetCData);
    Assert.AreEqual("<p>Hello Word1!</p><p>Hello Word2!</p><p>Hello Word3!</p>", targetCData.Text);
}
/// <summary>
/// A group of unnamed (plain-text) units is merged back into one unit whose target is
/// PlainText with the segments joined by newlines — no HTML wrapping.
/// </summary>
public void OneGroupMultipleParagraphPlainTextAreMergedBackIntoOneUnit()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" trgLang=""it-IT"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <group id=""u1-g"" name=""original""> <unit id=""u1-1""> <segment> <source>Hello Word1!</source> <target>Hello Word1!</target> </segment> </unit> <unit id=""u1-2""> <segment> <source>Hello Word2!</source> <target>Hello Word2!</target> </segment> </unit> <unit id=""u1-3""> <segment> <source>Hello Word3!</source> <target>Hello Word3!</target> </segment> </unit> </group> </file> </xliff>";

    XliffDocument sourceDocument = LoadXliff(xliff);
    var paragraphSplitter = new ParagraphSplitter(_htmlParser);

    var mergedDocument = paragraphSplitter.ExecuteMerge(sourceDocument);

    Assert.AreEqual(1, mergedDocument.Files[0].Containers.Count);
    var mergedUnit = mergedDocument.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(mergedUnit);
    Assert.AreEqual("u1", mergedUnit.Id);
    Assert.AreEqual("original", mergedUnit.Name);
    Assert.AreEqual(1, mergedUnit.Resources[0].Target.Text.Count);

    // Plain-text merge joins the paragraphs with the platform newline.
    var mergedText = mergedUnit.Resources[0].Target.Text[0] as PlainText;
    Assert.IsNotNull(mergedText);
    var expected = "Hello Word1!" + Environment.NewLine + "Hello Word2!" + Environment.NewLine + "Hello Word3!";
    Assert.AreEqual(expected, mergedText.Text);
}
/// <summary>
/// A plain-text source containing three sentences is split into a group of three
/// units, one per sentence.
/// NOTE(review): method name has a typo ("Muliple"); left unchanged so test-runner
/// filters and CI references keep working.
/// </summary>
public void MulipleParagraphPlainTextUnitIsSplit()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <unit id=""u1"" name=""originaParagraph""> <segment> <source>Hello Word1! Hello Word2! Hello Word3!</source> </segment> </unit> </file> </xliff>";
    XliffDocument document = LoadXliff(xliff);
    var splitter = new ParagraphSplitter(_htmlParser);

    var newDocument = splitter.ExecuteExtraction(document);

    Assert.AreEqual(1, newDocument.Files[0].Containers.Count);
    var group = newDocument.Files[0].Containers[0] as Group;
    Assert.IsNotNull(group);
    Assert.AreEqual("u1-g", group.Id);
    Assert.AreEqual("originaParagraph", group.Name);
    Assert.AreEqual(3, group.Containers.Count);

    var unit1 = group.Containers[0] as Unit;
    // Guard each cast before dereferencing so a wrong container type fails the
    // assert instead of throwing a NullReferenceException.
    Assert.IsNotNull(unit1);
    var textUnit1 = unit1.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("Hello Word1!", textUnit1);

    var unit2 = group.Containers[1] as Unit;
    Assert.IsNotNull(unit2);
    var textUnit2 = unit2.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("Hello Word2!", textUnit2);

    var unit3 = group.Containers[2] as Unit;
    Assert.IsNotNull(unit3);
    var textUnit3 = unit3.Resources[0].Source.Text[0].ToString();
    Assert.AreEqual("Hello Word3!", textUnit3);
}
/// <summary>
/// A "class" attribute stored as mda:metadata on a nested "html:li" unit is restored
/// onto the rebuilt &lt;li&gt; element when the document is merged.
/// </summary>
public void ClassAttributeInsideNestedHtmlElementIsReadFromXliff()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff xmlns:mda=""urn:oasis:names:tc:xliff:metadata:2.0"" srcLang=""en-GB"" trgLang=""it-IT"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <group id=""u1-g"" name=""original""> <group id=""u1-1-g"" type=""html:ul""> <unit id=""u1-1-1"" type=""html:li""> <mda:metadata> <mda:metaGroup id=""originalAttributes""> <mda:meta type=""class"">even</mda:meta> </mda:metaGroup> </mda:metadata> <segment> <source>Hello Word1!</source> <target>Hello Word1!</target> </segment> </unit> </group> </group> </file> </xliff>";

    XliffDocument sourceDocument = LoadXliff(xliff);
    var paragraphSplitter = new ParagraphSplitter(_htmlParser);

    var mergedDocument = paragraphSplitter.ExecuteMerge(sourceDocument);

    Assert.AreEqual(1, mergedDocument.Files[0].Containers.Count);
    var mergedUnit = mergedDocument.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(mergedUnit);
    Assert.AreEqual("u1", mergedUnit.Id);
    Assert.AreEqual("original", mergedUnit.Name);
    Assert.AreEqual(1, mergedUnit.Resources[0].Target.Text.Count);

    // The metadata-backed class attribute must reappear on the <li>.
    var targetCData = mergedUnit.Resources[0].Target.Text[0] as CDataTag;
    Assert.IsNotNull(targetCData);
    Assert.AreEqual("<ul><li class=\"even\">Hello Word1!</li></ul>", targetCData.Text);
}
/// <summary>
/// A unit whose source is a single plain-text sentence passes through extraction
/// unchanged — it stays a Unit rather than becoming a Group.
/// </summary>
public void SingleParagraphPlainTextUnitIsNotSplit()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <unit id=""u1""> <segment> <source>Hello Word!</source> </segment> </unit> </file> </xliff>";

    XliffDocument sourceDocument = LoadXliff(xliff);
    var paragraphSplitter = new ParagraphSplitter(_htmlParser);

    var extracted = paragraphSplitter.ExecuteExtraction(sourceDocument);

    // Still exactly one container, and it is still a Unit (not a Group).
    Assert.AreEqual(1, extracted.Files[0].Containers.Count);
    var resultUnit = extracted.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(resultUnit);
}
/// <summary>
/// Two sibling groups are each merged back into their own unit, and each unit's
/// target becomes a single CDATA payload with the paragraphs re-wrapped in &lt;p&gt; tags.
/// </summary>
public void TwoGroupsWithMultipleParagraphCDataAreMergedBackIntoOneUnit()
{
    var xliff = @"<?xml version=""1.0"" encoding=""utf-8""?> <xliff srcLang=""en-GB"" trgLang=""it-IT"" version=""2.0"" xmlns=""urn:oasis:names:tc:xliff:document:2.0""> <file id=""f1""> <group id=""u1-g"" name=""original1""> <unit id=""u1-1"" name=""p""> <segment> <source>Hello Word1!</source> <target>Hello Word1!</target> </segment> </unit> <unit id=""u1-2"" name=""p""> <segment> <source>Hello Word2!</source> <target>Hello Word2!</target> </segment> </unit> <unit id=""u1-3"" name=""p""> <segment> <source><![CDATA[Hello <b>Word3</b>!]]></source> <target><![CDATA[Hello <b>Word3</b>!]]></target> </segment> </unit> </group> <group id=""u2-g"" name=""original2""> <unit id=""u2-1"" name=""p""> <segment> <source>Hello2 Word1!</source> <target>Hello2 Word1!</target> </segment> </unit> <unit id=""u2-2"" name=""p""> <segment> <source>Hello2 Word2!</source> <target>Hello2 Word2!</target> </segment> </unit> <unit id=""u2-3"" name=""p""> <segment> <source><![CDATA[Hello2 <b>Word3</b>!]]></source> <target><![CDATA[Hello2 <b>Word3</b>!]]></target> </segment> </unit> </group> </file> </xliff>";
    XliffDocument document = LoadXliff(xliff);
    var splitter = new ParagraphSplitter();

    var newDocument = splitter.ExecuteMerge(document);

    Assert.AreEqual(2, newDocument.Files[0].Containers.Count);
    var unit1 = newDocument.Files[0].Containers[0] as Unit;
    Assert.IsNotNull(unit1);
    Assert.AreEqual("u1", unit1.Id);
    Assert.AreEqual("original1", unit1.Name);

    var unit2 = newDocument.Files[0].Containers[1] as Unit;
    Assert.IsNotNull(unit2);
    Assert.AreEqual("u2", unit2.Id);
    Assert.AreEqual("original2", unit2.Name);

    var cdata1 = unit1.Resources[0].Target.Text[0] as CDataTag;
    Assert.IsNotNull(cdata1);
    Assert.AreEqual("<p>Hello Word1!</p><p>Hello Word2!</p><p>Hello <b>Word3</b>!</p>", cdata1.Text);

    var cdata2 = unit2.Resources[0].Target.Text[0] as CDataTag;
    Assert.IsNotNull(cdata2);
    // Fixed: this expected-value literal was broken across a physical line (a
    // non-verbatim string cannot span lines); rejoined to mirror the first group.
    Assert.AreEqual("<p>Hello2 Word1!</p><p>Hello2 Word2!</p><p>Hello2 <b>Word3</b>!</p>", cdata2.Text);
}
/// <summary>
/// Parses <paramref name="parse"/>.SourceText as paragraph-structured prose and fills
/// the view model's output fields (Recovered, Formatted, FormattedPos, Glossed,
/// Colorized). If the text cannot be parsed as prose, falls back to
/// <c>ProcessParserModelSentences</c>. Per-paragraph failures are recorded inline in
/// the corresponding output as bolded "[[CANNOT …]]" markers rather than aborting.
/// </summary>
/// <param name="parse">View model carrying the source text in and the rendered HTML out.</param>
private static void ProcessParserModelParagraphs(SimpleParserViewModel parse)
{
    Dialect dialect = BindDialect(parse);
    ParagraphSplitter ps = new ParagraphSplitter(dialect);

    // Accumulators for each output field of the view model.
    // NOTE(review): normalizedSb and errors are never appended to in this path; they
    // are kept so Normalized/Errors are still assigned (empty) below — confirm the
    // sentence path is the only producer before removing them.
    StringBuilder normalizedSb = new StringBuilder();
    StringBuilder spitBackSb = new StringBuilder();
    StringBuilder bracketSb = new StringBuilder();
    StringBuilder posSb = new StringBuilder();
    StringBuilder glossSb = new StringBuilder();
    StringBuilder errors = new StringBuilder();
    StringBuilder colorized = new StringBuilder();
    HtmlFormatter hf = new HtmlFormatter();

    Prose prose;
    try
    {
        prose = ps.ParseProse(parse.SourceText);
    }
    catch (Exception)
    {
        // Can't treat the input as paragraphs — fall back to sentence-by-sentence.
        ProcessParserModelSentences(parse);
        return;
    }

    foreach (Paragraph paragraph in prose.Paragraphs)
    {
        // Spit-back: re-render the parsed paragraph in the "g" (general) format.
        try
        {
            spitBackSb.AppendLine(paragraph.ToString("g", dialect).ToHtml() + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT REPEAT BACK: " + ex.Message + "]]";
            spitBackSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        // Colorized HTML rendering of the paragraph.
        try
        {
            colorized.AppendLine(paragraph.ToString("html", dialect) + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT COLORIZE: " + ex.Message + "]]";
            // Fixed copy-paste bug: the colorize error was appended to spitBackSb,
            // leaving the colorized output silently missing a paragraph.
            colorized.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        // Bracketed ("b" format) rendering, bolded.
        try
        {
            bracketSb.AppendLine(hf.BoldTheWords(paragraph.ToString("b", dialect).ToHtml()) + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT BRACKET: " + ex.Message + "]]";
            bracketSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        // English gloss + part-of-speech gloss. A single failure marks both outputs.
        try
        {
            dialect.TargetGloss = "en";
            GlossMaker gm = new GlossMaker();
            string glossed = gm.GlossParagraph(paragraph, dialect);
            glossSb.AppendLine(glossed.ToHtml() + "<br/>");
            glossed = gm.GlossParagraph(paragraph, dialect, true);
            posSb.AppendLine(glossed.ToHtml() + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT GLOSS: " + ex.Message.ToHtml() + "]]";
            glossSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            posSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }
        // NOTE(review): dialect.TargetGloss is left as "en" after this loop (the old
        // code had a commented-out finally restoring "tp") — confirm callers rebind.
    }

    parse.Normalized = normalizedSb.ToString();
    parse.Recovered = spitBackSb.ToString();
    parse.Formatted = bracketSb.ToString();
    parse.FormattedPos = hf.SubThePartsOfSpeech(posSb.ToString());
    parse.Glossed = glossSb.ToString();
    parse.Colorized = colorized.ToString();
    parse.Errors = errors.ToString();
}