/// <summary>
/// Converts BIG5 text to UTF-8 text, substituting characters through the mapping
/// table held in <c>GlobalParameters.SummaryTrans</c>.
/// Not suitable for military-network to civilian-network conversion: the military
/// network has many user-defined character codes that conflict with the CNS11643
/// (全字庫) code assignments.
/// </summary>
/// <param name="content">Text to convert; it is processed one UTF-16 code unit at a time.</param>
/// <returns>The converted text followed by a line terminator.</returns>
public string ReplaceOutUt8(string content)
{
    // No try/catch here: the original only did "throw ex;", which discards the
    // stack trace. Letting exceptions propagate keeps the same exception types
    // for callers while preserving the original trace.
    XmlDocument xmlSummary = GlobalParameters.SummaryTrans;
    Encoding big5 = Encoding.GetEncoding("big5");
    Encoding utf8 = Encoding.UTF8;
    StringBuilder strResult = new StringBuilder();

    // Placeholder text emitted for characters that cannot be transcoded.
    string strFillUnknowWord = UnKnowWord(big5);

    char[] arr = content.ToCharArray();
    for (int i = 0; i < arr.Length; i++)
    {
        // BIG5 bytes of the current character, rendered as upper-case hex without
        // separators so they can be matched against <BIG5Code> in the table.
        byte[] big5Bytes = big5.GetBytes(arr, i, 1);
        string big5Hex = BitConverter.ToString(big5Bytes).Replace("-", "");

        XmlNode specificNode = xmlSummary.SelectSingleNode("//Encoding/BIG5Code[text()='" + big5Hex + "']");
        if (specificNode == null)
        {
            // No mapping entry. The bytes are BIG5-encoded, so they must be decoded
            // as BIG5; decoding them as UTF-8 garbles every double-byte character.
            // For single-byte (ASCII) output the result is the same as before.
            strResult.Append(big5.GetString(big5Bytes));
            continue;
        }

        // A missing <UTF8Code> sibling is treated the same as an empty one.
        XmlNode utf8Node = specificNode.ParentNode.SelectSingleNode("UTF8Code");
        string utf8Hex = utf8Node == null ? string.Empty : utf8Node.InnerText;

        if (utf8Hex.Length > 0)
        {
            // Resulting byte count is the hex string length / 2.
            byte[] byteOUT = new byte[utf8Hex.Length / 2];
            for (int k = 0; k < utf8Hex.Length; k = k + 2)
            {
                // Every two hex digits become one byte.
                byteOUT[k / 2] = Convert.ToByte(utf8Hex.Substring(k, 2), 16);
            }
            strResult.Append(utf8.GetString(byteOUT));
        }
        else
        {
            // Mapping entry exists but carries no UTF-8 value: record the
            // character as unknown and emit the placeholder instead.
            Abnormal ukw = new Abnormal
            {
                WordCount = (i + 1).ToString(),
                OriginWord = GetUnknowString(content, i, 10)
            };
            UnknowWords.Add(ukw);
            strResult.Append(strFillUnknowWord);
        }
    }

    strResult.AppendLine();
    return strResult.ToString();
}
/// <summary>
/// Converts BIG5 text to UTF-8 text; intended for military-network to
/// civilian-network transcoding.
/// Each character is classified by its double-byte BIG5 code: codes in the
/// table-mapped ranges are replaced through <c>GlobalParameters.SummaryTrans</c>,
/// codes in the standard ranges go through <c>TransferWordB2U</c>, and everything
/// else is recorded in <c>UnknowWords</c> and replaced by a placeholder.
/// Successfully mapped characters are recorded in <c>TransferedWords</c>.
/// </summary>
/// <param name="content">Text to convert; it is processed one UTF-16 code unit at a time.</param>
/// <returns>The converted text, or <c>null</c> if any exception occurs during conversion.</returns>
public string filterOutUTF8ExtraChar(string content)
{
    try
    {
        Encoding big5 = Encoding.GetEncoding("big5");
        Encoding utf8 = new UTF8Encoding(false);
        StringBuilder strResult = new StringBuilder();

        // Placeholder text emitted for characters that cannot be transcoded.
        string strFillUnknowWord = UnKnowWord(big5);
        XmlDocument xmlSummary = GlobalParameters.SummaryTrans;

        char[] arr = content.ToCharArray();
        for (int i = 0; i < arr.Length; i++)
        {
            byte[] big5Bytes = big5.GetBytes(arr, i, 1);
            if (big5Bytes.Length != 2)
            {
                // Not a double-byte BIG5 character: pass the bytes through unchanged
                // (ASCII-range bytes decode identically as UTF-8).
                strResult.Append(utf8.GetString(big5Bytes));
                continue;
            }

            int code = big5Bytes[0] * 256 + big5Bytes[1];

            // Ranges resolved through the mapping table. 0xC6A1-0xC8FE was labelled
            // "reserved for user-defined characters" in the original comments.
            bool usesMappingTable = (code >= 0x8140 && code <= 0xA0FE)
                                 || (code >= 0xC6A1 && code <= 0xC8FE);

            // Ranges converted directly by TransferWordB2U:
            //   0xA140-0xA3BF  punctuation, Greek letters and special symbols
            //                  (0xA259-0xA261 holds nine measurement-unit Han characters)
            //   0xA440-0xC67E  frequently used Han characters
            //   0xC940-0xF9D5  less frequently used Han characters
            //   0xF9D6-0xF9FE  CP950 additions: seven ETEN characters and 34
            //                  box-drawing symbols (ref: https://zh.wikipedia.org/wiki/代碼頁950)
            bool isStandard = (code >= 0xA140 && code <= 0xA3BF)
                           || (code >= 0xA440 && code <= 0xC67E)
                           || (code >= 0xC940 && code <= 0xF9D5)
                           || (code >= 0xF9D6 && code <= 0xF9FE);

            if (usesMappingTable)
            {
                string mapped = null;
                XmlNode specificNode = xmlSummary.SelectSingleNode("//Encoding/BIG5Code[text()='" + code.ToString("X") + "']");
                if (specificNode != null)
                {
                    // A missing <UTF8Code> sibling is treated the same as an empty one.
                    XmlNode utf8Node = specificNode.ParentNode.SelectSingleNode("UTF8Code");
                    string utf8Hex = utf8Node == null ? string.Empty : utf8Node.InnerText;
                    if (utf8Hex.Length > 0)
                    {
                        // Resulting byte count is the hex string length / 2;
                        // every two hex digits become one byte.
                        byte[] byteOUT = new byte[utf8Hex.Length / 2];
                        for (int k = 0; k < utf8Hex.Length; k = k + 2)
                        {
                            byteOUT[k / 2] = Convert.ToByte(utf8Hex.Substring(k, 2), 16);
                        }
                        mapped = utf8.GetString(byteOUT);
                    }
                }

                if (mapped != null)
                {
                    strResult.Append(mapped);

                    // Record the character that was transcoded through the table.
                    Abnormal tfw = new Abnormal
                    {
                        WordCount = (i + 1).ToString(),
                        OriginWord = GetUnknowString(content, i, 10),
                        TransWord = mapped
                    };
                    TransferedWords.Add(tfw);
                    continue;
                }
                // No table entry, or an entry with no UTF-8 value: fall through to
                // the unknown-character handling below. Previously an empty UTF-8
                // value silently dropped the character without recording it.
            }
            else if (isStandard)
            {
                strResult.Append(TransferWordB2U(new char[] { arr[i] }));
                continue;
            }

            // Everything that reaches this point cannot be transcoded: unmapped
            // table-range codes, the reserved area 0xA3C0-0xA3FE, the user-defined
            // area 0xF9FF-0xFEFE, and any other double-byte code.
            Abnormal ukw = new Abnormal
            {
                WordCount = (i + 1).ToString(),
                OriginWord = GetUnknowString(content, i, 10),
                TransWord = strFillUnknowWord
            };
            UnknowWords.Add(ukw);
            strResult.Append(strFillUnknowWord);
        }

        return strResult.ToString();
    }
    catch (Exception)
    {
        // Existing contract: any failure is reported to the caller as null.
        return null;
    }
}