/// <summary>
        /// Loads the binary form of the dictionary from <paramref name="path"/> plus
        /// <c>Predefine.BIN_EXT</c>: first the value table, then the key trie.
        /// </summary>
        /// <param name="path">Dictionary path without the binary extension.</param>
        /// <returns>
        /// true when the trie was loaded and no bytes were left over; false when the byte
        /// array could not be created, the trie failed to load, trailing data remained,
        /// or any exception was raised while reading.
        /// </returns>
        private static bool LoadDat(string path)
        {
            try
            {
                var bytes = ByteArray.Create(path + Predefine.BIN_EXT);
                if (bytes == null)
                {
                    return false;
                }

                // The values come first: an entry count followed by one record per entry.
                var count  = bytes.NextInt();
                var values = new WordAttr[count];

                for (int idx = 0; idx < count; idx++)
                {
                    // Each record is: total frequency, number of natures, then (nature, freq) pairs.
                    var total    = bytes.NextInt();
                    var natCount = bytes.NextInt();
                    var entry    = new WordAttr(natCount);
                    entry.totalFreq = total;

                    for (int k = 0; k < natCount; k++)
                    {
                        entry.natures[k] = (Nature)bytes.NextInt();
                        entry.freqs[k]   = bytes.NextInt();
                    }
                    values[idx] = entry;
                }

                // The keys follow; the file is only accepted when it has been consumed completely.
                if (!_trie.Load(bytes, values))
                {
                    return false;
                }
                return !bytes.HasMore();
            }
            catch (Exception)
            {
                // log warning "dat file reading failed"
                return false;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Creates a vertex for a word.
        /// </summary>
        /// <param name="word">
        /// Equivalent word; when null it is derived from <paramref name="realWord"/> and the
        /// attribute through <c>CompileRealWord</c> (which may also replace the word id and attribute).
        /// </param>
        /// <param name="realWord">The real word; must be non-null and non-empty.</param>
        /// <param name="attr">Word attribute; when null a default <c>WordAttr(Nature.n, 1)</c> is used.</param>
        /// <param name="wordId">Word id stored on the vertex.</param>
        /// <exception cref="ArgumentNullException"><paramref name="realWord"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="realWord"/> is empty.</exception>
        public Vertex(string word, string realWord, WordAttr attr, int wordId)
        {
            // Validate before touching any state: previously a null realWord only surfaced
            // as a NullReferenceException after the other fields had been computed.
            if (realWord == null)
            {
                throw new ArgumentNullException(nameof(realWord));
            }
            if (realWord.Length == 0)
            {
                // The original message says that building a blank vertex would cause an infinite loop.
                // The message text is kept unchanged for callers that surface it.
                throw new ArgumentException("构造空白节点会导致死循环");
            }

            this.attr = attr ?? new WordAttr(Nature.n, 1);

            // wordId must be assigned before CompileRealWord runs, because that call may overwrite it.
            this.wordId   = wordId;
            this.word     = word ?? CompileRealWord(realWord, this.attr);
            this.realWord = realWord;
        }
Beispiel #3
0
        /// <summary>
        /// Converts a list of terms into a list of vertices, looking up each term's
        /// attribute in the core dictionary.
        /// </summary>
        /// <param name="terms">Terms to convert.</param>
        /// <param name="appendStart">Whether to add the auxiliary start vertex (<c>Vertex.B</c>) at the front.</param>
        /// <returns>One vertex per term, preceded by the start vertex when requested.</returns>
        public static List <Vertex> ToVertexList(List <Term> terms, bool appendStart)
        {
            var result = new List <Vertex>(terms.Count + 1);
            if (appendStart)
            {
                result.Add(Vertex.B);
            }

            for (int index = 0; index < terms.Count; index++)
            {
                var current  = terms[index];
                var wordAttr = CoreDictionary.GetAttr(current.word);

                if (wordAttr != null)
                {
                    // NOTE: this overwrites the nature of the original Term with the dictionary's first nature.
                    current.nature = wordAttr.natures[0];
                }
                else
                {
                    // Not in the core dictionary: blank text is tagged x (plain string),
                    // anything else nz (other proper noun).
                    var fallback = string.IsNullOrWhiteSpace(current.word) ? Nature.x : Nature.nz;
                    wordAttr = new WordAttr(fallback);
                }

                result.Add(new Vertex(current.word, wordAttr));
            }

            return result;
        }
Beispiel #4
0
        /// <summary>
        /// Inserts a new word into the custom dictionary (overwrite mode).
        /// Dynamic insertions and removals are not persisted to the dictionary file.
        /// </summary>
        /// <param name="word">The new word, e.g. "裸婚".</param>
        /// <param name="natWithFreq">Nature and frequency; when null or empty, nature nz with frequency 1 is used.</param>
        /// <returns>
        /// false when <paramref name="word"/> is null/empty or the attribute cannot be created
        /// from <paramref name="natWithFreq"/>; otherwise true.
        /// </returns>
        public static bool Insert(string word, string natWithFreq = null)
        {
            if (string.IsNullOrEmpty(word))
            {
                return false;
            }

            var key = Config.NormalizeChar ? CharTable.Convert(word) : word;

            WordAttr attr;
            if (string.IsNullOrEmpty(natWithFreq))
            {
                attr = new WordAttr(Nature.nz, 1);
            }
            else
            {
                attr = WordAttr.Create(natWithFreq);
            }

            if (attr == null)
            {
                return false;
            }

            // Try the double-array trie first; when it does not accept the word,
            // fall back to the (lazily created) BinTrie.
            if (!dat.Set(key, attr))
            {
                if (binTrie == null)
                {
                    binTrie = new BinTrie <WordAttr>();
                }
                binTrie.Put(key, attr);
            }
            return true;
        }
Beispiel #5
0
        /// <summary>
        /// Loads a text dictionary file into <paramref name="dict"/>. Each line holds a word
        /// followed by zero or more (nature, frequency) pairs. Files ending in ".csv" are
        /// comma separated; all other files are separated by spaces and/or tabs.
        /// </summary>
        /// <param name="path">Path of the dictionary file.</param>
        /// <param name="defNat">Nature assigned to words that have no (nature, frequency) pair.</param>
        /// <param name="dict">Dictionary that receives the entries; existing keys are overwritten.</param>
        /// <returns>true when the whole file was read; false when any exception occurred.</returns>
        private static bool Load(string path, Nature defNat, SortedDictionary <string, WordAttr> dict)
        {
            try
            {
                // For whitespace-delimited files, runs of blanks (or a leading blank) must not
                // produce empty fields, otherwise every following field is shifted and parsing fails.
                // CSV keeps empty fields because there the position of a field is significant.
                var isCsv    = path.EndsWith(".csv", StringComparison.Ordinal);
                var splitter = isCsv ? new[] { ',' } : new[] { ' ', '\t' };
                var options  = isCsv ? StringSplitOptions.None : StringSplitOptions.RemoveEmptyEntries;

                foreach (var line in File.ReadLines(path))
                {
                    // Blank lines used to be stored under an empty-string key.
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var segs = line.Split(splitter, options);
                    if (segs.Length == 0 || segs[0].Length == 0)
                    {
                        continue;
                    }

                    if (Config.NormalizeChar)
                    {
                        segs[0] = CharTable.Convert(segs[0]);
                    }

                    // Fields after the word come in (nature, frequency) pairs.
                    var      natCount = (segs.Length - 1) / 2;
                    WordAttr attr;
                    if (natCount == 0)
                    {
                        attr = new WordAttr(defNat);
                    }
                    else
                    {
                        attr = new WordAttr(natCount);
                        for (int i = 0; i < natCount; i++)
                        {
                            attr.natures[i] = NatureHelper.GetOrCreate(segs[1 + (i << 1)]);
                            attr.freqs[i]   = int.Parse(segs[(i + 1) << 1]);
                            attr.totalFreq += attr.freqs[i];
                        }
                    }
                    dict[segs[0]] = attr;
                }
                return true;
            }
            catch (Exception)
            {
                // Any failure (missing file, malformed frequency, ...) is reported as "not loaded".
                return false;
            }
        }
Beispiel #6
0
        /// <summary>
        /// Maps a real word to its equivalent word. When the attribute carries exactly one
        /// nature and that nature belongs to one of the special classes below, the class tag
        /// is returned and this vertex's word id (and for some classes its attribute) is
        /// replaced with the class-level entry from the core dictionary.
        /// </summary>
        /// <param name="realWord">The original word.</param>
        /// <param name="attr">Attribute whose natures decide the mapping.</param>
        /// <returns>The class tag for special natures; otherwise <paramref name="realWord"/> unchanged.</returns>
        private string CompileRealWord(string realWord, WordAttr attr)
        {
            if (attr.natures.Length != 1)
            {
                return realWord;
            }

            var nature = attr.natures[0];

            // The checks below are evaluated in the same order as the original switch sections.
            if (nature >= Nature.nr && nature <= Nature.nr2)
            {
                // Person name
                wordId = CoreDictionary.NR_WORD_ID;
                return TAG_PEOPLE;
            }

            if (nature == Nature.ns || nature == Nature.nsf)
            {
                // Place name
                wordId = CoreDictionary.NS_WORD_ID;
                return TAG_PLACE;
            }

            if (nature == Nature.nx)
            {
                // Proper noun
                wordId    = CoreDictionary.NX_WORD_ID;
                this.attr = CoreDictionary.GetAttr(CoreDictionary.NX_WORD_ID);
                return TAG_PROPER;
            }

            if ((nature >= Nature.nt && nature <= Nature.nth) || nature == Nature.nit)
            {
                wordId = CoreDictionary.NT_WORD_ID;
                return TAG_GROUP;
            }

            if (nature == Nature.m || nature == Nature.mq)
            {
                wordId    = CoreDictionary.M_WORD_ID;
                this.attr = CoreDictionary.GetAttr(CoreDictionary.M_WORD_ID);
                return TAG_NUMBER;
            }

            if (nature == Nature.x)
            {
                wordId    = CoreDictionary.X_WORD_ID;
                this.attr = CoreDictionary.GetAttr(CoreDictionary.X_WORD_ID);
                return TAG_CLUSTER;
            }

            if (nature == Nature.t)
            {
                wordId    = CoreDictionary.T_WORD_ID;
                this.attr = CoreDictionary.GetAttr(CoreDictionary.T_WORD_ID);
                return TAG_TIME;
            }

            return realWord;
        }
        /// <summary>
        /// Loads the core dictionary. The binary cache is tried first through <c>LoadDat</c>;
        /// when that fails the text file at <c>Config.Core_Dict_Path</c> is parsed, the trie is
        /// built from it and <c>SaveDat</c> is called with the parsed entries.
        /// </summary>
        /// <returns>
        /// true when the dictionary was loaded from either source; false when the text file is
        /// missing or cannot be read. Malformed natures or frequencies still raise the parser's exception.
        /// </returns>
        private static bool Load()
        {
            if (LoadDat(Config.Core_Dict_Path))
            {
                return true;
            }

            var dict = new SortedDictionary <string, WordAttr>(StrComparer.Default);

            try
            {
                var separators = new[] { ' ', '\t' };
                foreach (var line in File.ReadLines(Config.Core_Dict_Path))
                {
                    var segs = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                    if (segs.Length == 0)
                    {
                        // Blank line: there is no word, and segs[0] below would throw an
                        // IndexOutOfRangeException that neither catch clause handles.
                        continue;
                    }

                    // Fields after the word come in (nature, frequency) pairs.
                    var natCount = (segs.Length - 1) / 2;
                    var attr     = new WordAttr(natCount);
                    for (int i = 0; i < natCount; i++)
                    {
                        attr.natures[i] = (Nature)Enum.Parse(typeof(Nature), segs[1 + (i << 1)]);
                        attr.freqs[i]   = int.Parse(segs[(i + 1) << 1]);
                        attr.totalFreq += attr.freqs[i];
                    }
                    dict[segs[0]] = attr;
                }
                _trie.Build(dict);

                SaveDat(Config.Core_Dict_Path, dict);
                return true;
            }
            catch (FileNotFoundException)
            {
                // log warning "core dictionary file does not exist"
                return false;
            }
            catch (IOException)
            {
                // log warning "core dictionary file read error"
                return false;
            }
        }
Beispiel #8
0
        /// <summary>
        /// Loads the binary form of the dictionary from <paramref name="path"/> plus
        /// <c>Predefine.BIN_EXT</c>: an optional header of user-defined natures, the value
        /// table, and finally the keys.
        /// </summary>
        /// <param name="path">Dictionary path without the binary extension.</param>
        /// <returns>
        /// The result of loading the keys into <c>dat</c>; false when the byte array could not
        /// be created or any exception was raised while reading.
        /// </returns>
        private static bool LoadDat(string path)
        {
            try
            {
                var bytes = ByteArray.Create(path + Predefine.BIN_EXT);
                if (bytes == null)
                {
                    return false;
                }

                int count = bytes.NextInt();
                if (count < 0)
                {
                    // Compatibility measure: a negative leading value means the header stores
                    // that many (negated) user-defined nature names before the real entry count.
                    for (int n = count; n < 0; n++)
                    {
                        // Register the user-defined nature.
                        NatureHelper.GetOrCreate(bytes.NextString());
                    }
                    count = bytes.NextInt();
                }

                // Load the values.
                var values = new WordAttr[count];
                for (int idx = 0; idx < count; idx++)
                {
                    var total    = bytes.NextInt();
                    var natCount = bytes.NextInt();
                    var entry    = new WordAttr(natCount);
                    entry.totalFreq = total;

                    for (int k = 0; k < natCount; k++)
                    {
                        entry.natures[k] = (Nature)bytes.NextInt();
                        entry.freqs[k]   = bytes.NextInt();
                    }
                    values[idx] = entry;
                }

                // Load the keys.
                return dat.Load(bytes, values);
            }
            catch (Exception)
            {
                return false;
            }
        }
Beispiel #9
0
        /// <summary>
        /// Locks this vertex's nature to <paramref name="nature"/>.
        /// </summary>
        /// <param name="nature">The nature to lock to.</param>
        /// <returns>
        /// true when <c>attr.GetFreq(nature)</c> is non-zero (or the nature is already the only
        /// one); false when it is zero, in which case the nature is still locked with a
        /// frequency of 1000.
        /// </returns>
        public bool ConfirmNature(Nature nature)
        {
            // Already locked: exactly one nature and it is the requested one.
            // (The previous check tested Length == 0 and then read natures[0], so it could
            // never succeed and threw IndexOutOfRangeException for an empty nature array.)
            if (attr.natures.Length == 1 && attr.natures[0] == nature)
            {
                return true;
            }

            // Otherwise rebuild the attribute so the nature array holds only the locked nature.
            bool res  = true;
            var  freq = attr.GetFreq(nature);

            if (freq == 0)
            {
                // A zero frequency is treated as "nature not present": use a default frequency.
                freq = 1000;
                res  = false;
            }
            attr = new WordAttr(nature, freq);
            return res;
        }
Beispiel #10
0
 /// <summary>
 /// Merges consecutive vertices into one longer word stored at the first position;
 /// every other position in the range is reset to null.
 /// </summary>
 /// <param name="vertices">Vertex array to update in place.</param>
 /// <param name="start">Start position, inclusive.</param>
 /// <param name="end">End position, exclusive.</param>
 /// <param name="attr">Attribute of the resulting word.</param>
 private static void CombineWords(Vertex[] vertices, int start, int end, WordAttr attr)
 {
     // A range holding a single word only needs its attribute replaced.
     if (end == start + 1)
     {
         vertices[start].attr = attr;
         return;
     }

     var merged = new StringBuilder();
     for (int pos = start; pos < end; pos++)
     {
         var vertex = vertices[pos];
         if (vertex == null)
         {
             continue;   // nothing at this position
         }

         // Append this piece to the longer word and clear its slot.
         merged.Append(vertex.realWord);
         vertices[pos] = null;
     }

     vertices[start] = new Vertex(merged.ToString(), attr);
 }
Beispiel #11
0
        /// <summary>
        /// Updates the attribute of an entry when the entry already exists, either in the
        /// core dictionary or in the dictionary being loaded.
        /// </summary>
        /// <param name="key">The entry's word.</param>
        /// <param name="attr">Attribute whose natures, frequencies and total frequency are copied onto the existing entry.</param>
        /// <param name="dict">Entries collected during loading.</param>
        /// <param name="rewriteDict">Records, by word id, the core-dictionary entries that were updated.</param>
        /// <returns>true when an existing entry was found and updated; otherwise false.</returns>
        private static bool UpdateAttrIfExist(string key, WordAttr attr, SortedDictionary <string, WordAttr> dict, SortedDictionary <int, WordAttr> rewriteDict)
        {
            var  coreId = CoreDictionary.GetWordId(key);
            bool inCore = coreId != -1;

            // The core dictionary takes precedence; the loading dictionary is consulted only otherwise.
            WordAttr target;
            if (inCore)
            {
                target = CoreDictionary.GetAttr(coreId);
            }
            else if (!dict.TryGetValue(key, out target))
            {
                return false;
            }

            target.natures   = attr.natures;
            target.freqs     = attr.freqs;
            target.totalFreq = attr.totalFreq;

            if (inCore)
            {
                rewriteDict[coreId] = attr;
            }
            return true;
        }
Beispiel #12
0
 /// <summary>
 /// Creates a vertex from a real word, attribute and word id. Delegates to the
 /// four-argument constructor, passing null as the equivalent word.
 /// </summary>
 /// <param name="realWord">The real word.</param>
 /// <param name="attr">Word attribute.</param>
 /// <param name="wordId">Word id.</param>
 public Vertex(string realWord, WordAttr attr, int wordId) : this(null, realWord, attr, wordId)
 {
 }
Beispiel #13
0
 /// <summary>
 /// Creates a vertex from a real word and attribute. Delegates to the four-argument
 /// constructor, passing null as the equivalent word and -1 as the word id.
 /// </summary>
 /// <param name="realWord">The real word.</param>
 /// <param name="attr">Word attribute.</param>
 public Vertex(string realWord, WordAttr attr) : this(null, realWord, attr, -1)
 {
 }
Beispiel #14
0
 /// <summary>
 /// Creates a vertex from a single character. The character is converted to a string
 /// and passed to the (string, WordAttr) constructor.
 /// </summary>
 /// <param name="realWord">The real word as a single character.</param>
 /// <param name="attr">Word attribute.</param>
 public Vertex(char realWord, WordAttr attr) : this(realWord.ToString(), attr)
 {
 }