/// <summary>
/// Converts a string to a byte array using the encoding selected by the charset code.
/// Callers generally need the trailing terminator, so it is appended by default.
/// </summary>
/// <param name="tCode">Charset code selecting the encoding (CP932 or UTF8).</param>
/// <param name="s">Text to encode; null or empty yields an empty array or a lone terminator.</param>
/// <param name="tEOS">When true, a single 0 byte is appended as the end-of-string marker.</param>
/// <returns>
/// The encoded bytes, or null when <paramref name="tCode"/> is not one of the handled charsets.
/// </returns>
protected byte[] StringToBytes(CharsetCode tCode, string s, bool tEOS = true)
{
    if (string.IsNullOrEmpty(s))
    {
        // Nothing to encode: return either no bytes at all or just the terminator.
        return tEOS ? new byte[] { 0 } : new byte[0];
    }

    byte[] b = null;
    switch (tCode)
    {
        case CharsetCode.CP932:
            b = Encoding.GetEncoding("shift_jis").GetBytes(s);
            break;
        case CharsetCode.UTF8:
            b = Encoding.UTF8.GetBytes(s);
            break;
    }

    if (b == null)
    {
        // Unhandled charset: report it as null regardless of tEOS instead of
        // failing with a NullReferenceException while appending the terminator.
        return null;
    }

    if (tEOS)
    {
        // Grow by one byte and make the new last byte the terminator.
        Array.Resize(ref b, b.Length + 1);
        b[b.Length - 1] = 0;
    }

    return b;
}
/// <summary>
/// Converts a region of a byte array to a string using the encoding selected by the charset code.
/// Decoding stops at the first 0 byte inside the region.
/// </summary>
/// <param name="tCode">Charset code selecting the encoding (CP932 or UTF8).</param>
/// <param name="b">Source buffer.</param>
/// <param name="o">Offset of the first byte to decode.</param>
/// <param name="l">
/// Maximum number of bytes to decode. Zero or a negative value means
/// "up to the first 0 byte or the end of the buffer".
/// </param>
/// <returns>
/// The decoded text, or null when <paramref name="b"/> is null or
/// <paramref name="tCode"/> is not one of the handled charsets.
/// </returns>
protected string BytesToString(CharsetCode tCode, byte[] b, int o = 0, int l = 0)
{
    if (b == null)
    {
        return null;
    }

    // Exclusive end of the scan: the whole buffer, narrowed to the requested
    // length when one is given and it fits. Written as a subtraction so that
    // o + l cannot overflow, and clamped so the scan never indexes past the buffer.
    int limit = b.Length;
    if (l > 0 && l < limit - o)
    {
        limit = o + l;
    }

    // Count bytes up to the terminator. A fresh counter is used so that a
    // negative l does not skew the resulting length.
    int count = 0;
    for (int i = o; i < limit; i++)
    {
        if (b[i] == 0)
        {
            break;
        }
        count++;
    }

    string s = null;
    switch (tCode)
    {
        case CharsetCode.CP932:
            s = Encoding.GetEncoding("shift_jis").GetString(b, o, count);
            break;
        case CharsetCode.UTF8:
            s = Encoding.UTF8.GetString(b, o, count);
            break;
    }

    return s;
}
/// <summary>
/// Decodes one character from the sentence buffer by dispatching to the decoder
/// that matches the charset code.
/// </summary>
/// <param name="tCode">Charset code of the buffer contents.</param>
/// <param name="tSentence">Buffer holding the encoded text.</param>
/// <param name="tBegin">Begin position passed through to the decoder.</param>
/// <param name="tEnd">End position passed through to the decoder.</param>
/// <param name="rLength">Passed by reference to the decoder; left untouched for unhandled charsets.</param>
/// <returns>The decoder's result, or 0 when the charset code is not handled.</returns>
protected ushort GetUnicode(CharsetCode tCode, byte[] tSentence, int tBegin, int tEnd, ref int rLength)
{
    // Only CP932 and UTF8 are wired up. The original source keeps EUC_JP, UTF16,
    // UTF16LE, UTF16BE, ASCII and a UTF-8 default only as commented-out cases.
    if (tCode == CharsetCode.CP932)
    {
        return CP932ToUnicode(tSentence, tBegin, tEnd, ref rLength);
    }

    if (tCode == CharsetCode.UTF8)
    {
        return UTF8ToUnicode(tSentence, tBegin, tEnd, ref rLength);
    }

    return 0;
}
//-----------------------------------------------------------
/// <summary>
/// Analyzes the given text and returns one entry per result node.
/// </summary>
/// <param name="tText">Text to analyze.</param>
/// <returns>
/// An array whose elements are the node surface, a comma, and the node feature.
/// Returns null when the tokenizer or connector is missing, when processing or
/// best-path construction fails, or when there are no nodes other than BOS/EOS.
/// </returns>
public string[] Analyze(string tText)
{
    if (m_Tokenizer == null || m_Connector == null)
    {
        return null;
    }

    // Encode the text in the tokenizer's charset; the terminator is required here.
    CharsetCode tCode = m_Tokenizer.GetCharsetCode();
    byte[] tSentence = StringToBytes(tCode, tText);

    int tLength = GetLength(tSentence);
    Node[] tBeginNodes = new Node[tLength + 4];
    Node[] tEndNodes = new Node[tLength + 4];

    // Best-path construction only runs when processing succeeded.
    if (!Process(tSentence, tBeginNodes, tEndNodes) || !BuildBestLattice(tEndNodes, tLength))
    {
        return null;
    }

    // First pass: count the nodes that are neither BOS nor EOS.
    int tCount = 0;
    Node tCursor = tEndNodes[0];
    while (tCursor != null)
    {
        if (tCursor.stat != MECAB_BOS_NODE && tCursor.stat != MECAB_EOS_NODE)
        {
            tCount++;
        }
        tCursor = tCursor.next;
    }

    if (tCount == 0)
    {
        return null;
    }

    // Second pass: build "surface,feature" for each counted node.
    string[] tFeature = new string[tCount];
    int tIndex = 0;
    tCursor = tEndNodes[0];
    while (tCursor != null)
    {
        if (tCursor.stat != MECAB_BOS_NODE && tCursor.stat != MECAB_EOS_NODE)
        {
            string tSurface = BytesToString(tCode, tCursor.surface_s, tCursor.surface_o, tCursor.length);
            string tDetail = BytesToString(tCode, tCursor.feature_s, tCursor.feature_o);
            tFeature[tIndex] = tSurface + "," + tDetail;
            tIndex++;
        }
        tCursor = tCursor.next;
    }

    return tFeature;
}
//---------------------------------------------------------------------------
/// <summary>
/// Closes any current state, then loads the unknown-word dictionary, the system
/// dictionary and the character properties from the given directory.
/// </summary>
/// <param name="tDirectory">Directory containing the dictionary files.</param>
/// <returns>
/// True when everything was loaded; false when a file fails to open, the system
/// dictionary's type is not 0, or a character category has no entry in the
/// unknown-word dictionary.
/// </returns>
public bool Open(string tDirectory)
{
    Close();

    // Unknown-word dictionary.
    m_UNKDictionary = new WordDictionary();
    if (m_UNKDictionary.Open(Path.Combine(tDirectory, UNK_DIC_FILE).Replace("\\", "/")) == false)
    {
        return false;
    }

    //----------------------------------
    // System dictionary.
    m_Dictionaries = new List<WordDictionary>();

    WordDictionary tSystemDictionary = new WordDictionary();
    if (tSystemDictionary.Open(Path.Combine(tDirectory, SYS_DIC_FILE).Replace("\\", "/")) == false)
    {
        return false;
    }

    if (tSystemDictionary.Type != 0)
    {
        return false;
    }

    // Derive the charset code from the dictionary's charset string.
    m_CharsetCode = GetCharsetCode(tSystemDictionary.Charset);

    // Register the system dictionary in the dictionary list.
    m_Dictionaries.Add(tSystemDictionary);

    //----------------------------------
    // Character properties.
    m_CharProperty = new CharProperty(m_CharsetCode);
    if (m_CharProperty.Open(tDirectory) == false)
    {
        return false;
    }

    //----------------------------------
    // Sizes are taken from the last dictionary in the list.
    int tLast = m_Dictionaries.Count - 1;
    m_LSize = (uint)m_Dictionaries[tLast].LSize;
    m_RSize = (uint)m_Dictionaries[tLast].RSize;

    //----------------------------------------------------------
    // Look up one unknown-word token per character category.
    m_UNKTokens = new List<KeyValuePair<Token, int>>();
    for (int i = 0; i < m_CharProperty.Size; ++i)
    {
        byte[] tKey = m_CharProperty.GetName(i);
        DoubleArray.Word n = m_UNKDictionary.ExactMatchSearch(tKey);
        if (n.value == -1)
        {
            // Decode the key before logging: concatenating the byte[] directly
            // would print its type name rather than the category name.
            Debug.LogWarning("cannot find UNK category: " + BytesToString(m_CharsetCode, tKey));
            return false;
        }

        Token tToken = m_UNKDictionary.GetToken(n);
        int tSize = m_UNKDictionary.GetSize(n);
        m_UNKTokens.Add(new KeyValuePair<Token, int>(tToken, tSize));
    }

    //----------------------------------------------------------
    m_Space = m_CharProperty.GetCharInfo(0x20); // ad-hoc

    m_BOSFeature = StringToBytes(m_CharsetCode, BOS_FEATURE);

    // m_MaxGroupingSize = DEFAULT_MAX_GROUPING_SIZE ;

    return true;
}
//---------------------------------------------------------------------------
/// <summary>
/// Creates a character property holder that remembers the given charset code.
/// </summary>
/// <param name="tCharsetCode">Charset code stored for later use by this instance.</param>
public CharProperty(CharsetCode tCharsetCode)
{
    this.m_CharsetCode = tCharsetCode;
}