예제 #1
0
        /// <summary>
        /// Loads a lexicon from a previously analyzed and saved lexicon file.
        /// </summary>
        /// <remarks>
        /// A usable line holds exactly two whitespace-separated fields: the word followed by its
        /// integer weight. Blank lines and lines with any other number of fields are skipped.
        /// </remarks>
        /// <param name="existLexiconFile">Path of the saved lexicon file to read.</param>
        /// <param name="encode">Encoding scheme applied to the vocabulary once it has been loaded.</param>
        /// <returns>The populated lexicon.</returns>
        /// <exception cref="System.FormatException">A weight field is not a valid integer.</exception>
        /// <exception cref="System.OverflowException">A weight field does not fit in an <see cref="int"/>.</exception>
        public static Lexicon FromExistLexiconFile(string existLexiconFile, EncodeScheme encode = EncodeScheme.Halfman)
        {
            Lexicon lexicon = new Lexicon();

            using (StreamReader sr = new StreamReader(existLexiconFile))
            {
                // Split on runs of whitespace so repeated spaces/tabs between the two
                // fields do not produce empty fields that would disqualify the line.
                Regex separator = new Regex("\\s+");
                string line;
                // ReadLine returns null only at end of stream. Testing for null (rather than
                // null-or-empty) keeps a blank line in the file from ending the load early.
                while ((line = sr.ReadLine()) != null)
                {
                    string trimmed = line.Trim();
                    if (trimmed.Length == 0)
                    {
                        continue;
                    }

                    string[] fields = separator.Split(trimmed);
                    if (fields.Length != 2)
                    {
                        continue;
                    }

                    var index = lexicon.AddVocabulary(fields[0]);
                    // The file is machine-readable data, so parse independently of the current culture.
                    lexicon._voca_array[index].Weight =
                        int.Parse(fields[1], System.Globalization.CultureInfo.InvariantCulture);
                }
            }

            lexicon.SortVocabulary();

            // Apply the requested encoding; any other scheme leaves the vocabulary without codes.
            switch (encode)
            {
                case EncodeScheme.Halfman:
                    lexicon.UpdateHalfmanCode();
                    break;
                case EncodeScheme.Onehot:
                    lexicon.UpdateOnehotCode();
                    break;
            }

            return lexicon;
        }
예제 #2
0
        /// <summary>
        /// Builds a lexicon by segmenting each line of a raw text file into words and
        /// counting how often every word occurs.
        /// </summary>
        /// <param name="vocabularyFile">Path of the raw text file to analyze.</param>
        /// <param name="encode">Encoding scheme applied to the vocabulary once it has been built.</param>
        /// <returns>The populated lexicon.</returns>
        public static Lexicon FromVocabularyFile(string vocabularyFile, EncodeScheme encode = EncodeScheme.Halfman)
        {
            Lexicon lexicon = new Lexicon();

            // Read the text and build the vocabulary.
            using (StreamReader sr = new StreamReader(vocabularyFile))
            {
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine().ClearPunctuation();
                    // IsNullOrWhiteSpace already covers the null and empty cases.
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    // Segment the line into words.
                    foreach (string word in lexicon.Sgement(line))
                    {
                        if (string.IsNullOrWhiteSpace(word))
                        {
                            continue;
                        }

                        lexicon._train_word_count++;
                        int i = lexicon.SearchVocabulary(word);
                        if (i == -1)
                        {
                            // Resolve the new slot BEFORE reading _voca_array: C# evaluates the
                            // array operand ahead of the index expression, so the one-line form
                            // would index a stale array if AddVocabulary replaces the field.
                            // NOTE(review): whether AddVocabulary reallocates is not visible here.
                            var added = lexicon.AddVocabulary(word);
                            lexicon._voca_array[added].Weight = 1;
                        }
                        else
                        {
                            lexicon._voca_array[i].Weight++;
                        }

                        // Shrink the vocabulary once it exceeds 70% of the hash size.
                        if (lexicon.VocaSize > _voca_hash_size * 0.7)
                        {
                            lexicon.ReduceVocabulary();
                        }
                    }
                }
            }

            // Per the original note, this step is expected to (1) drop words below the minimum
            // frequency and (2) sort the vocabulary — TODO confirm against SortVocabulary.
            lexicon.SortVocabulary();

            // Apply the requested encoding; any other scheme leaves the vocabulary without codes.
            switch (encode)
            {
                case EncodeScheme.Halfman:
                    lexicon.UpdateHalfmanCode();
                    break;
                case EncodeScheme.Onehot:
                    lexicon.UpdateOnehotCode();
                    break;
            }

            return lexicon;
        }