/// <summary>
/// Attaches this object to the tokenizer that owns it, then runs the Action()
/// and SetMap() setup steps (subclass-specific; see overrides).
/// </summary>
/// <param name="tk">The tokenizer this object is being initialized for.</param>
public virtual void Initialize(Tokenizer tk)
{
	Tokenizer = tk;
	Action();
	SetMap();
}
/// <summary>
/// Converts each input file described in <paramref name="fileInfos"/> to UTF-8,
/// tokenizing the text using the marker mappings in <paramref name="mappings"/>.
/// Output either carries a UTF-8 BOM (written through a StreamWriter) or is
/// written BOM-less as raw UTF-8 bytes, per each file's HasBOM flag.
/// </summary>
/// <param name="mappings">Marker/data mappings; empty encoding names fall back to
/// DefaultMarkerMap / DefaultDataMap.</param>
/// <param name="fileInfos">Per-file input/output names, source encoding and BOM flag.</param>
public void ConvertNew(IDCMapping[] mappings, IDCFileInfo[] fileInfos)
{
	m_tokenizer = new Tokenizer();
	foreach (IDCMapping mapping in mappings)
	{
		// Fall back to the default maps when no encoding was specified.
		if (mapping.MarkerEncoding.Length <= 0)
			mapping.MarkerEncoding = DefaultMarkerMap;
		if (mapping.DataEncoding.Length <= 0)
			mapping.DataEncoding = DefaultDataMap;
		m_tokenizer.Tri.Add(MarkerSpec.CreateMarkerSpec(mapping));
	}

	foreach (IDCFileInfo fileInfo in fileInfos)
	{
		bool fAlreadyUnicode;
		System.Text.Encoding encoding = GetInputEncoding(fileInfo.FileEncoding, out fAlreadyUnicode);

		StreamReader streamReader = new StreamReader(
			new FileStream(fileInfo.InputFileName, FileMode.Open, FileAccess.Read), encoding);
		try
		{
			m_tokenizer.Input = streamReader;
			ConvertOneFile(fileInfo, fAlreadyUnicode);
		}
		finally
		{
			// Closing the reader also closes the underlying FileStream, even if
			// conversion threw (the original code leaked both on an exception).
			streamReader.Close();
		}
	}
}

/// <summary>
/// Maps a DCFileEncoding value to the System.Text.Encoding used to read the input,
/// and reports whether the input is already Unicode (byte-based inputs are read
/// through ReversibleEncoding and still need conversion).
/// </summary>
private System.Text.Encoding GetInputEncoding(DCFileEncoding fileEncoding, out bool fAlreadyUnicode)
{
	fAlreadyUnicode = true;
	switch (fileEncoding)
	{
		case DCFileEncoding.DC_FE_BYTES:
		case DCFileEncoding.DC_FE_Unknown:
			fAlreadyUnicode = false;
			return ReversibleEncoding;
		case DCFileEncoding.DC_FE_UTF16BE:
			return System.Text.Encoding.BigEndianUnicode;
		case DCFileEncoding.DC_FE_UTF16LE:
			return System.Text.Encoding.Unicode;
		case DCFileEncoding.DC_FE_UTF8:
			return System.Text.Encoding.UTF8;
		default:
			Debug.Fail("Requested input file encoding not implemented.");
			fAlreadyUnicode = false;
			return ReversibleEncoding;
	}
}

/// <summary>
/// Tokenizes m_tokenizer's current input to the end and writes the converted
/// output for one file. When a BOM is wanted, a StreamWriter over UTF-8 supplies
/// it; otherwise raw UTF-8 bytes are written directly because StreamWriter always
/// emits a preamble. (A cleaner BOM-less alternative would be a StreamWriter over
/// new UTF8Encoding(false), whose GetPreamble() is empty — see
/// Encoding.GetPreamble documentation.)
/// </summary>
private void ConvertOneFile(IDCFileInfo fileInfo, bool fAlreadyUnicode)
{
	Stream outputStream = new FileStream(fileInfo.OutputFileName, FileMode.Create, FileAccess.Write);
	StreamWriter outputWriter = null;
	bool fBOM = fileInfo.HasBOM;
	try
	{
		if (fBOM)
			outputWriter = new StreamWriter(outputStream, System.Text.Encoding.UTF8);

		Token token;
		do
		{
			token = m_tokenizer.Next();
			string output = token.Output(fAlreadyUnicode);
			if (fBOM)
			{
				if (token is NewlineToken)
					outputWriter.WriteLine(output);
				else
					outputWriter.Write(output);
			}
			else
			{
				byte[] bytes = System.Text.Encoding.UTF8.GetBytes(output);
				outputStream.Write(bytes, 0, bytes.Length);
				if (token is NewlineToken)
				{
					outputStream.WriteByte((byte)'\r');
					outputStream.WriteByte((byte)'\n');
				}
			}
		} while (!(token is EndOfFileToken));
	}
	finally
	{
		// Closing the writer flushes it and closes the underlying stream;
		// runs even when tokenizing throws (original leaked the output file).
		if (outputWriter != null)
			outputWriter.Close();
		else
			outputStream.Close();
	}
}
/// <summary>
/// If the token on top of the tokenizer's stack is a StartInlineToken, removes
/// that marker's end entry from the tokenizer's tri; otherwise does nothing.
/// </summary>
/// <param name="tk">The tokenizer whose token stack and tri are examined.</param>
protected void RemoveEndMarkerFromTri(Tokenizer tk)
{
	StartInlineToken topToken = tk.TokenStack.Peek() as StartInlineToken;
	if (topToken == null)
		return;
	StartInlineMarkerSpec startSpec = (StartInlineMarkerSpec)topToken.Spec;
	// NOTE(review): this removes startSpec.End while AddEndMarkerToTri adds
	// sims.EndSpec — confirm both refer to the same tri entry.
	tk.Tri.Remove(startSpec.End);
}
/// <summary>
/// Adds the end-marker spec (sims.EndSpec) of the given start-inline marker spec
/// to the tokenizer's tri.
/// </summary>
/// <param name="tk">The tokenizer whose tri is updated.</param>
/// <param name="sims">The start-inline marker spec whose EndSpec is added.</param>
protected void AddEndMarkerToTri(Tokenizer tk, StartInlineMarkerSpec sims)
{
	tk.Tri.Add(sims.EndSpec);
}
/// <summary>
/// Creates a fresh tokenizer (m_tokenizer) and populates its tri with a marker
/// spec for every marker declared by MarkerClass.
/// </summary>
public void SetUpTokenizer()
{
	MarkerClass markerClass = new MarkerClass();
	m_tokenizer = new Tokenizer();
	foreach (MarkerClass.Marker marker in markerClass.markers)
	{
		// No ECMapping object here: ECMapping rewrites replaceMarker and
		// endReplaceMarker automatically, and this test only exercises the
		// tokenizer, so marker replacement (tested elsewhere) is bypassed by
		// passing the marker itself as its own replacement.
		MarkerSpec spec = MarkerSpec.CreateMarkerSpec(
			marker.m_marker, marker.m_markerMap, marker.m_dataMap,
			marker.m_isInline, marker.m_marker, marker.m_end, marker.m_end);
		m_tokenizer.Tri.Add(spec);
	}
}
/// <summary>
/// Registers three TECkit converters in the on-disk XML encoding repository and
/// verifies that a DataToken's raw output converts through each of them (plus the
/// default conversion) to the expected Unicode string.
/// </summary>
public void TestTokenConvert()
{
	// Loads converters from the on-disk XML encoding repository file.
	EncConverters converters = new EncConverters();
	// Location of the TECkit map files.
	string mapDir = Info.TestFileDir;

	// Write three converters to the XML encoding repository file on disk.
	converters.Add("ISO-8859-1<>UNICODE", mapDir + @"iso-8859-1.map",
		ConvType.Legacy_to_from_Unicode, "ISO-8859-1", "UNICODE",
		ProcessTypeFlags.UnicodeEncodingConversion);
	converters.Add("ASCII<>MIXED CASE UNICODE", mapDir + @"mixedcase.map",
		ConvType.Legacy_to_from_Unicode, "ISO-8859-1", "UNICODE",
		ProcessTypeFlags.UnicodeEncodingConversion);
	converters.Add("ASCII>UPPER CASE UNICODE", mapDir + @"uppercase.map",
		ConvType.Legacy_to_Unicode, "ISO-8859-1", "UNICODE",
		ProcessTypeFlags.UnicodeEncodingConversion);

	Token token = new DataToken(0, 0, @"Hello, World!");
	// Setting token.Tokenizer and token.Map simulates Initialize() so a full
	// test environment does not need to be created.
	token.Tokenizer = new Tokenizer();

	// An empty map name indicates the default Unicode conversion should be used.
	CheckConversion(token, "", @"Hello, World!");
	CheckConversion(token, "ISO-8859-1<>UNICODE", @"Hello, World!");
	CheckConversion(token, "ASCII<>MIXED CASE UNICODE", @"hELLO,~~~wORLD!");
	CheckConversion(token, "ASCII>UPPER CASE UNICODE", @"HELLO,~~WORLD!");
}

/// <summary>
/// Assigns the named map to the token, converts its raw output to Unicode, and
/// asserts the result equals <paramref name="expected"/>.
/// </summary>
private static void CheckConversion(Token token, string mapName, string expected)
{
	token.Map = mapName;
	string rawString = token.RawOutput();
	Assert.AreEqual(expected, token.ConvertToUnicode(rawString));
}
/// <summary>
/// Verifies that already-Unicode token data is passed through without invoking a
/// converter: the map name refers to a converter that does not exist, so the test
/// would fail if conversion were attempted.
/// </summary>
public void TestDontConvertUnicodeToken()
{
	Token token = new DataToken(0, 0, @"Hello, World!");
	// Setting token.Tokenizer and token.Map simulates Initialize() so a full
	// test environment does not need to be created.
	token.Tokenizer = new Tokenizer();
	// Non-existent converter name: it must never be looked up when the data is
	// already Unicode (Output(true)).
	token.Map = "Garbanzo";
	Assert.AreEqual(@"Hello, World!", token.Output(true));
}