示例#1
0
        /// <summary>
        /// Encodes a string into a byte array using the given charset, optionally
        /// appending a single 0 byte as an end-of-string terminator.
        /// </summary>
        /// <param name="tCode">Target charset; only CP932 and UTF8 are encoded.</param>
        /// <param name="s">Text to encode. Null and empty are treated the same.</param>
        /// <param name="tEOS">When true (the default), a trailing 0 byte is appended.</param>
        /// <returns>
        /// The encoded bytes. For null/empty text this is an empty array, or a single
        /// 0 byte when <paramref name="tEOS"/> is true. Returns null when the text is
        /// non-empty and <paramref name="tCode"/> is not a supported charset.
        /// </returns>
        protected byte[] StringToBytes(CharsetCode tCode, string s, bool tEOS = true)
        {
            // Null/empty text never touches an encoder, so it is handled before the charset check.
            if (string.IsNullOrEmpty(s))
            {
                return tEOS ? new byte[] { 0 } : new byte[0];
            }

            Encoding tEncoding = null;

            switch (tCode)
            {
            case CharsetCode.CP932:
                tEncoding = Encoding.GetEncoding("shift_jis");
                break;

            case CharsetCode.UTF8:
                tEncoding = Encoding.UTF8;
                break;
            }

            // Unsupported charset: report it as null regardless of tEOS. Previously the
            // terminator path dereferenced the null buffer and threw NullReferenceException.
            if (tEncoding == null)
            {
                return null;
            }

            if (!tEOS)
            {
                return tEncoding.GetBytes(s);
            }

            // Allocate one extra slot up front; new arrays are zero-filled, so the final
            // byte already is the terminator and no second resize/copy is needed.
            byte[] b = new byte[tEncoding.GetByteCount(s) + 1];
            tEncoding.GetBytes(s, 0, s.Length, b, 0);

            return b;
        }
示例#2
0
        /// <summary>
        /// Decodes a byte range into a string using the given charset. Decoding stops
        /// at the first 0 byte inside the range.
        /// </summary>
        /// <param name="tCode">Charset of the bytes; only CP932 and UTF8 are decoded.</param>
        /// <param name="b">Source buffer.</param>
        /// <param name="o">Index of the first byte to decode.</param>
        /// <param name="l">
        /// Maximum number of bytes to decode. Zero or a negative value means
        /// "up to the first 0 byte, or the end of the buffer".
        /// </param>
        /// <returns>
        /// The decoded string, or null when <paramref name="b"/> is null or
        /// <paramref name="tCode"/> is not a supported charset.
        /// </returns>
        protected string BytesToString(CharsetCode tCode, byte[] b, int o = 0, int l = 0)
        {
            if (b == null)
            {
                return null;
            }

            // Exclusive upper bound of the scan: the requested range when a positive
            // length is given, but never beyond the end of the buffer.
            int tLimit = b.Length;
            if (l > 0 && (o + l) < tLimit)
            {
                tLimit = o + l;
            }

            // Count bytes up to the first terminator. The count always starts at zero,
            // so a negative l can no longer leak into the resulting length.
            int tCount = 0;
            for (int i = o; i < tLimit; i++)
            {
                if (b[i] == 0)
                {
                    break;
                }
                tCount++;
            }

            switch (tCode)
            {
            case CharsetCode.CP932:
                return Encoding.GetEncoding("shift_jis").GetString(b, o, tCount);

            case CharsetCode.UTF8:
                return Encoding.UTF8.GetString(b, o, tCount);
            }

            // Unsupported charset.
            return null;
        }
示例#3
0
        /// <summary>
        /// Dispatches to the charset-specific converter and returns its result.
        /// </summary>
        /// <param name="tCode">Charset of <paramref name="tSentence"/>.</param>
        /// <param name="tSentence">Byte buffer handed to the converter.</param>
        /// <param name="tBegin">Begin position handed to the converter.</param>
        /// <param name="tEnd">End position handed to the converter.</param>
        /// <param name="rLength">
        /// Passed by reference to the converter; presumably receives the byte length of
        /// the decoded character (TODO confirm against the converters). Left untouched
        /// when the charset is not handled.
        /// </param>
        /// <returns>The converter's result, or 0 when the charset is not handled.</returns>
        protected ushort GetUnicode(CharsetCode tCode, byte[] tSentence, int tBegin, int tEnd, ref int rLength)
        {
            if (tCode == CharsetCode.CP932)
            {
                return CP932ToUnicode(tSentence, tBegin, tEnd, ref rLength);
            }

            if (tCode == CharsetCode.UTF8)
            {
                return UTF8ToUnicode(tSentence, tBegin, tEnd, ref rLength);
            }

            // Only CP932 and UTF8 are wired up; every other charset
            // (EUC_JP, UTF16, UTF16LE, UTF16BE, ASCII, ...) yields 0.
            return 0;
        }
示例#4
0
        //-----------------------------------------------------------

        /// <summary>
        /// Analyzes the given text and returns one entry per resulting node, excluding
        /// the BOS and EOS nodes. Each entry has the form "surface,feature".
        /// </summary>
        /// <param name="tText">Text to analyze.</param>
        /// <returns>
        /// The per-node strings, or null when the tokenizer or connector is missing,
        /// when processing or best-lattice construction fails, or when no node other
        /// than BOS/EOS was produced.
        /// </returns>
        public string[] Analyze(string tText)
        {
            if (m_Tokenizer == null || m_Connector == null)
            {
                return null;
            }

            // Encode the text in the tokenizer's charset (with the end-of-string terminator).
            CharsetCode tCharset = m_Tokenizer.GetCharsetCode();
            byte[] tBytes = StringToBytes(tCharset, tText);
            int tLength = GetLength(tBytes);

            Node[] tBegins = new Node[tLength + 4];
            Node[] tEnds = new Node[tLength + 4];

            if (!Process(tBytes, tBegins, tEnds) || !BuildBestLattice(tEnds, tLength))
            {
                return null;
            }

            // First pass: count the nodes that carry output.
            int tCount = 0;
            for (Node tCursor = tEnds[0]; tCursor != null; tCursor = tCursor.next)
            {
                if (tCursor.stat == MECAB_BOS_NODE || tCursor.stat == MECAB_EOS_NODE)
                {
                    continue;
                }
                tCount++;
            }

            if (tCount == 0)
            {
                return null;
            }

            // Second pass: build "surface,feature" for each counted node.
            string[] tResults = new string[tCount];
            int tIndex = 0;
            for (Node tCursor = tEnds[0]; tCursor != null; tCursor = tCursor.next)
            {
                if (tCursor.stat == MECAB_BOS_NODE || tCursor.stat == MECAB_EOS_NODE)
                {
                    continue;
                }

                string tSurface = BytesToString(tCharset, tCursor.surface_s, tCursor.surface_o, tCursor.length);
                string tDetail = BytesToString(tCharset, tCursor.feature_s, tCursor.feature_o);

                tResults[tIndex] = tSurface + "," + tDetail;
                tIndex++;
            }

            return tResults;
        }
示例#5
0
        //---------------------------------------------------------------------------

        /// <summary>
        /// Loads the unknown-word dictionary, the system dictionary and the character
        /// property data from a directory, then prepares the unknown-word tokens, the
        /// space character info and the BOS feature bytes.
        /// </summary>
        /// <param name="tDirectory">Directory containing the dictionary files.</param>
        /// <returns>
        /// True on success. False when a dictionary or the character property data
        /// cannot be opened, when the system dictionary's Type is not 0, or when a
        /// character category has no entry in the unknown-word dictionary.
        /// NOTE(review): on failure the fields assigned up to that point are left as
        /// they are; nothing in this method rolls them back.
        /// </returns>
        public bool Open(string tDirectory)
        {
            // Drop whatever a previous Open left behind.
            Close();

            // Unknown-word dictionary
            m_UNKDictionary = new WordDictionary();
            if (m_UNKDictionary.Open(Path.Combine(tDirectory, UNK_DIC_FILE).Replace("\\", "/")) == false)
            {
                return(false);
            }

            //----------------------------------
            // System dictionary

            m_Dictionaries = new List <WordDictionary>();

            WordDictionary tSystemDictionary = new WordDictionary();

            if (tSystemDictionary.Open(Path.Combine(tDirectory, SYS_DIC_FILE).Replace("\\", "/")) == false)
            {
                return(false);
            }

            // Only a dictionary whose Type is 0 is accepted as the system dictionary.
            if (tSystemDictionary.Type != 0)
            {
                return(false);
            }

            // Derive the charset identifier from the dictionary's charset string.
            m_CharsetCode = GetCharsetCode(tSystemDictionary.Charset);

            // Register the system dictionary.
            m_Dictionaries.Add(tSystemDictionary);

            //----------------------------------
            // Character property data

            m_CharProperty = new CharProperty(m_CharsetCode);
            if (m_CharProperty.Open(tDirectory) == false)
            {
                return(false);
            }

            //----------------------------------

            // Left/right sizes are taken from the last registered dictionary.
            int tLast = m_Dictionaries.Count - 1;

            m_LSize = ( uint )m_Dictionaries[tLast].LSize;
            m_RSize = ( uint )m_Dictionaries[tLast].RSize;

            //----------------------------------------------------------
            // Unknown-word tokens: one entry per character category

            m_UNKTokens = new List <KeyValuePair <Token, int> >();

            for (int i = 0; i < m_CharProperty.Size; ++i)
            {
                byte[] tKey = m_CharProperty.GetName(i);

                DoubleArray.Word n = m_UNKDictionary.ExactMatchSearch(tKey);

                if (n.value == -1)
                {
                    // Decode the key for the message; concatenating the byte[] itself
                    // would only print its type name ("System.Byte[]").
                    // NOTE(review): assumes category names are plain ASCII, so a UTF-8
                    // decode is readable whatever the dictionary charset is; confirm.
                    string tKeyName = (tKey == null) ? "(null)" : System.Text.Encoding.UTF8.GetString(tKey).TrimEnd('\0');

                    Debug.LogWarning("cannot find UNK category: " + tKeyName);
                    return(false);
                }

                Token tToken = m_UNKDictionary.GetToken(n);
                int   tSize  = m_UNKDictionary.GetSize(n);

                m_UNKTokens.Add(new KeyValuePair <Token, int>(tToken, tSize));
            }

            //----------------------------------------------------------

            m_Space = m_CharProperty.GetCharInfo(0x20);                 // ad-hoc

            m_BOSFeature = StringToBytes(m_CharsetCode, BOS_FEATURE);
//			m_MaxGroupingSize = DEFAULT_MAX_GROUPING_SIZE ;

            return(true);
        }
		//---------------------------------------------------------------------------

		/// <summary>
		/// Creates a CharProperty for the given charset code.
		/// </summary>
		/// <param name="tCharsetCode">Charset code stored in m_CharsetCode.</param>
		public CharProperty(CharsetCode tCharsetCode)
		{
			m_CharsetCode = tCharsetCode;
		}