Beispiel #1
0
        /// <summary>
        /// Loads the dictionary stored at <paramref name="path"/> into <c>_trie</c>.
        /// The binary cache is tried first; when it cannot be loaded the text file is parsed,
        /// the trie is rebuilt from it and a cache is written back via <c>SaveDat</c>.
        /// </summary>
        /// <param name="path">Path of the text dictionary; one space-separated "key value" pair per line.</param>
        /// <returns><c>true</c> when the trie was loaded or built; <c>false</c> when the text file yields no entries.</returns>
        private static bool Load(string path)
        {
            _trie = new DoubleArrayTrie<string>();
            if (LoadDat(path + ".bi" + Predefine.BIN_EXT))
            {
                return true;
            }

            var map = new SortedDictionary<string, string>(StrComparer.Default);

            foreach (var line in File.ReadLines(path))
            {
                var param = line.Split(' ');

                // Blank or truncated lines carry no value column; skip them instead of
                // failing on param[1] with an IndexOutOfRangeException.
                if (param.Length < 2)
                {
                    continue;
                }

                // Keys ending in "@" are not stored.
                if (param[0].EndsWith("@"))
                {
                    continue;
                }

                map[param[0]] = param[1];
            }

            if (map.Count == 0)
            {
                return false;
            }

            _trie.Build(map);

            // NOTE(review): the cache is read from path + ".bi" + BIN_EXT but SaveDat receives the
            // bare path; confirm SaveDat appends the same suffix.
            if (!SaveDat(path, map))
            {
                // log error
            }

            return true;
        }
        //public static BaseSearcher getSearcher(char[] charArray, DoubleArrayTrie<String> trie)
        //{
        //    return new Searcher(charArray, trie);
        //}

        /// <summary>
        /// Placeholder for longest-match segmentation over <paramref name="charArray"/>.
        /// The implementation has not been ported yet, so the arguments are ignored.
        /// </summary>
        /// <param name="charArray">Text to segment (currently unused).</param>
        /// <param name="trie">Dictionary trie (currently unused).</param>
        /// <returns>Always <c>null</c>.</returns>
        protected static String segLongest(char[] charArray, DoubleArrayTrie<String> trie)
        {
            // Reference implementation awaiting port (depends on the disabled getSearcher):
            //
            // StringBuilder sb = new StringBuilder(charArray.Length);
            // BaseSearcher searcher = getSearcher(charArray, trie);
            // KeyValuePair<String, String> entry;
            // int p = 0;  // position processed so far
            // int offset;
            // while ((entry = searcher.next()) != null)
            // {
            //     offset = searcher.getOffset();
            //     // copy through the text that had no dictionary match
            //     while (p < offset)
            //     {
            //         sb.Append(charArray[p]);
            //         ++p;
            //     }
            //     sb.Append(entry.Value);
            //     p = offset + entry.Key.Length;
            // }
            // // copy through the unmatched tail
            // while (p < charArray.Length)
            // {
            //     sb.Append(charArray[p]);
            //     ++p;
            // }
            // return sb.ToString();
            return null;
        }
        /// <summary>
        /// Rewrites <paramref name="chars"/> by replacing every match reported by the searcher
        /// with its mapped value and copying unmatched text through unchanged.
        /// </summary>
        /// <param name="chars">Input text.</param>
        /// <param name="trie">Dictionary mapping matched text to its replacement.</param>
        /// <returns>The rewritten text.</returns>
        public static string Seg4Longest(char[] chars, DoubleArrayTrie<string> trie)
        {
            var sb = new StringBuilder(chars.Length);
            var searcher = new Searcher(chars, trie);

            int pos = 0;
            Tuple<string, string> t;

            while ((t = searcher.Next()) != null)
            {
                int offset = searcher.Offset;

                // Copy through the text between the previous match and this one.
                if (pos < offset)
                {
                    sb.Append(chars, pos, offset - pos);
                }

                sb.Append(t.Item2);

                // Advance by the length of the matched input (Item1), not of the replacement
                // (Item2): when the two differ in length, using Item2 skips or repeats input.
                // NOTE(review): assumes Item1 is the matched key and Item2 its value — confirm
                // against Searcher.Next().
                pos = offset + t.Item1.Length;
            }

            // Copy through the unmatched tail.
            if (pos < chars.Length)
            {
                sb.Append(chars, pos, chars.Length - pos);
            }

            return sb.ToString();
        }
        /// <summary>
        /// Compiles the token-info dictionary found in <paramref name="inputDirname"/>, builds and
        /// validates the double-array trie over its surface forms, maps trie ids to surface
        /// indices and writes the results to <paramref name="outputDirname"/>.
        /// </summary>
        /// <param name="inputDirname">Directory holding the source dictionary files.</param>
        /// <param name="outputDirname">Directory receiving the compiled output.</param>
        /// <param name="encoding">Encoding handed to the token-info compiler.</param>
        /// <param name="compactTries">Forwarded to <c>DoubleArrayTrieCompiler.Build</c>.</param>
        void BuildTokenInfoDictionary(string inputDirname, string outputDirname, Encoding encoding, bool compactTries)
        {
            ProgressLog.Begin("compiling tokeninfo dict");

            var compiler = GetTokenInfoDictionaryCompiler(encoding);

            // First pass over the input: analyze features.
            ProgressLog.Println("analyzing dictionary features");
            using (var input = compiler.CombinedSequentialFileInputStream(inputDirname))
            {
                compiler.AnalyzeTokenInfo(input);
            }

            // Second pass over the input: read the entries.
            ProgressLog.Println("reading tokeninfo");
            using (var input = compiler.CombinedSequentialFileInputStream(inputDirname))
            {
                compiler.ReadTokenInfo(input);
            }

            compiler.Compile();

            var surfaces = compiler.Surfaces;

            ProgressLog.Begin("compiling double array trie");
            var triePath = Path.Combine(outputDirname, DoubleArrayTrie.DoubleArrayTrieFileName);
            using (var output = new FileStream(triePath, FileMode.Create, FileAccess.ReadWrite))
            {
                DoubleArrayTrieCompiler.Build(surfaces, compactTries).Write(output);
            }

            // Re-read the file just written and make sure every surface can be found in it.
            ProgressLog.Println("validating saved double array trie");
            DoubleArrayTrie daTrie;
            using (var saved = new FileStream(triePath, FileMode.Open, FileAccess.Read))
            {
                daTrie = DoubleArrayTrie.Read(saved);
                foreach (var surface in surfaces)
                {
                    if (daTrie.Lookup(surface) >= 0)
                    {
                        continue;
                    }

                    ProgressLog.Println("failed to look up [" + surface + "]");
                }
            }

            ProgressLog.End();

            ProgressLog.Begin("processing target map");

            // Associate each trie id with the index of its surface form.
            for (var index = 0; index < surfaces.Count; index++)
            {
                tokenInfoCompiler_AddMapping:
                compiler.AddMapping(daTrie.Lookup(surfaces[index]), index);
            }

            compiler.Write(outputDirname); // TODO: Should be refactored -Christian

            ProgressLog.End();

            ProgressLog.End();
        }
Beispiel #5
0
        /// <summary>
        /// Looks up every prefix of <paramref name="suffix"/> in the double-array trie and adds a
        /// known-word node to <paramref name="lattice"/> for each word id found.
        /// </summary>
        /// <param name="lattice">Lattice receiving the nodes.</param>
        /// <param name="startIndex">Index in the text at which <paramref name="suffix"/> starts.</param>
        /// <param name="suffix">Remaining text starting at <paramref name="startIndex"/>.</param>
        /// <returns><c>true</c> when at least one prefix produced a positive lookup result.</returns>
        bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
        {
            var anyKnown = false;

            for (var length = 1; length <= suffix.Length; length++)
            {
                var candidate = suffix.Substring(0, length);
                var lookup = DoubleArrayTrie.Lookup(candidate);

                if (lookup < 0)
                {
                    // A negative result ends the scan from this start index.
                    break;
                }

                if (lookup == 0)
                {
                    // Zero adds nothing; try the next, longer prefix.
                    continue;
                }

                anyKnown = true; // Don't produce unknown word starting from this index
                foreach (var wordId in Dictionary.LookupWordIds(lookup))
                {
                    lattice.AddNode(
                        new ViterbiNode(wordId, candidate, Dictionary, startIndex, ViterbiNode.NodeType.Known),
                        startIndex + 1,
                        startIndex + 1 + length);
                }
            }

            return anyKnown;
        }
Beispiel #6
0
 /// <summary>
 /// Instantiates every dictionary component from <c>Resolver</c>.
 /// <c>CharacterDefinitions</c> is assigned before <c>UnknownDictionary</c> because the latter
 /// receives it as an argument.
 /// </summary>
 protected internal virtual void LoadDictionaries()
 {
     // Components that need only the resolver.
     DoubleArrayTrie = DoubleArrayTrie.NewInstance(Resolver);
     ConnectionCosts = ConnectionCosts.NewInstance(Resolver);
     TokenInfoDictionary = TokenInfoDictionary.NewInstance(Resolver);
     CharacterDefinitions = CharacterDefinitions.NewInstance(Resolver);

     // Components that also depend on the values above or on TotalFeatures.
     UnknownDictionary = UnknownDictionary.NewInstance(Resolver, CharacterDefinitions, TotalFeatures);
     InsertedDictionary = new InsertedDictionary(TotalFeatures);
 }
Beispiel #7
0
        /// <summary>
        /// Loads the dictionary at <paramref name="path"/> into <c>_trie</c>.
        /// Values come from <c>OnLoadValue</c>; the trie itself is read from the
        /// "<paramref name="path"/>.trie.dat" cache when possible, otherwise it is rebuilt from the
        /// keys in the text file and the cache and values are saved again.
        /// </summary>
        /// <param name="path">Path of the text dictionary; the first whitespace-separated field of each line is the key.</param>
        /// <returns><c>true</c> on success; <c>false</c> when values or keys cannot be read.</returns>
        public bool Load(string path)
        {
            _trie = new DoubleArrayTrie<V>();
            var valueArr = OnLoadValue(path);

            if (valueArr == null)
            {
                // log info ""
                return false;
            }

            if (LoadDat(path + ".trie.dat", valueArr))
            {
                // log info ""
                return true;
            }

            var keys = new List<string>(valueArr.Length);

            try
            {
                foreach (var line in File.ReadLines(path))
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
                    keys.Add(segs[0]);
                }
            }
            catch (Exception)
            {
                // A partially read key list cannot be paired with the values; report failure
                // instead of silently building from incomplete data.
                return false;
            }

            var error = _trie.Build(keys, valueArr);

            if (error != 0)
            {
                // Direct build failed: rebuild from a sorted map and reorder the values to match
                // the map's key order.
                // NOTE(review): assumes keys and valueArr have the same length — confirm against OnLoadValue.
                var map = new SortedDictionary<string, V>(StrComparer.Default);
                for (int i = 0; i < valueArr.Length; i++)
                {
                    map[keys[i]] = valueArr[i];
                }

                _trie = new DoubleArrayTrie<V>();
                _trie.Build(map);

                int j = 0;
                foreach (var v in map.Values)
                {
                    valueArr[j++] = v;
                }
            }

            // Dispose the stream even when Save throws so the cache file is not left locked.
            using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
            {
                _trie.Save(fs);
            }

            OnSaveValue(valueArr, path);
            return true;
        }
Beispiel #8
0
        /// <summary>
        /// Loads the translated-person-name dictionary into <c>_trie</c>, preferring the binary
        /// cache and falling back to the text file at <c>Config.Translated_Person_Dict_Path</c>.
        /// </summary>
        /// <returns><c>true</c> when the trie was loaded or built; <c>false</c> when building from text threw.</returns>
        private static bool Load()
        {
            _trie = new DoubleArrayTrie<bool>();
            if (LoadDat())
            {
                return true;
            }

            // Fall back to the raw text dictionary.
            try
            {
                // Every translated name maps to true.
                var names = new SortedDictionary<string, bool>(StrComparer.Default);
                // Occurrence count of each character across all names.
                var charCounts = new SortedDictionary<char, int>();

                foreach (var name in File.ReadLines(Config.Translated_Person_Dict_Path))
                {
                    names[name] = true;
                    foreach (var ch in name)
                    {
                        // Leave out a few characters that are rarely used in names.
                        if ("不赞".IndexOf(ch) < 0)
                        {
                            // A missing entry leaves the count at 0, so the first hit stores 1.
                            charCounts.TryGetValue(ch, out int seen);
                            charCounts[ch] = seen + 1;
                        }
                    }
                }

                names["·"] = true;

                // Characters seen at least 10 times are treated as valid short forms of a name.
                foreach (var pair in charCounts)
                {
                    if (pair.Value >= 10)
                    {
                        names[pair.Key.ToString()] = true;
                    }
                }

                _trie.Build(names);
                return true;
            }
            catch (Exception)
            {
                return false;
            }
        }
Beispiel #9
0
        /// <summary>
        /// Creates a builder over the given dictionaries.
        /// </summary>
        /// <param name="doubleArrayTrie">Trie used for lookups.</param>
        /// <param name="dictionary">Token-info dictionary.</param>
        /// <param name="unknownDictionary">Unknown-word dictionary.</param>
        /// <param name="userDictionary">Optional user dictionary; <c>null</c> disables its use.</param>
        /// <param name="mode">Tokenizer mode; Search and Extended enable search mode.</param>
        public ViterbiBuilder(
            DoubleArrayTrie doubleArrayTrie,
            TokenInfoDictionary dictionary,
            UnknownDictionary unknownDictionary,
            UserDictionary userDictionary,
            TokenizerMode mode)
        {
            DoubleArrayTrie = doubleArrayTrie;
            Dictionary = dictionary;
            UnknownDictionary = unknownDictionary;
            UserDictionary = userDictionary;

            UseUserDictionary = userDictionary != null;

            switch (mode)
            {
                case TokenizerMode.Search:
                case TokenizerMode.Extended:
                    SearchMode = true;
                    break;
                default:
                    SearchMode = false;
                    break;
            }

            CharacterDefinitions = UnknownDictionary.CharacterDefinition;
        }
Beispiel #10
0
        /// <summary>
        /// Builds a double-array trie containing every entry of <paramref name="surfaces"/>.
        /// </summary>
        /// <param name="surfaces">Surface forms to insert.</param>
        /// <param name="compact">Passed to the <c>DoubleArrayTrie</c> constructor.</param>
        /// <returns>The built trie.</returns>
        public static DoubleArrayTrie Build(List<string> surfaces, bool compact)
        {
            // Collect the surfaces in an intermediate trie first.
            var source = new Trie.Trie();
            for (var index = 0; index < surfaces.Count; index++)
            {
                source.Add(surfaces[index]);
            }

            var result = new DoubleArrayTrie(compact);
            result.Build(source);
            return result;
        }
Beispiel #11
0
        /// <summary>
        /// Loads the core dictionary from <c>path</c> into <c>Trie</c> and logs the outcome.
        /// </summary>
        static CoreDictionary()
        {
            Trie = new DoubleArrayTrie<Attribute>();
            var timer = Stopwatch.StartNew();

            if (Load(path))
            {
                timer.Stop();
                Predefine.logger.Info(path + "加载成功," + Trie.Length + "个词条,耗时" + timer.ElapsedMilliseconds + "ms");
                return;
            }

            Predefine.logger.Error("核心词典" + path + "加载失败");
        }
 /// <summary>
 /// Creates a searcher over <paramref name="chars"/> starting at <paramref name="offset"/>.
 /// </summary>
 /// <param name="offset">Position at which scanning starts.</param>
 /// <param name="chars">Text to scan.</param>
 /// <param name="dat">Trie to search in.</param>
 public Searcher(int offset, char[] chars, DoubleArrayTrie<V> dat)
 {
     _dat = dat;
     charArr = chars;
     i = offset;
     last = dat._base[0];

     // Empty input gets -1 as its begin position.
     begin = chars.Length == 0 ? -1 : offset;
 }
Beispiel #13
0
        /// <summary>
        /// Loads the custom dictionaries listed in <c>Config.Custom_Dict_Path</c> into <c>dat</c>.
        /// The binary cache of the first entry is tried first; otherwise every listed file is
        /// parsed, the trie is built and the cache is saved.
        /// </summary>
        /// <returns><c>true</c> on success; <c>false</c> when parsing or building threw.</returns>
        private static bool Load()
        {
            if (LoadDat(Config.Custom_Dict_Path[0]))
            {
                return true;
            }

            dat = new DoubleArrayTrie<WordAttr>();
            var entries = new SortedDictionary<string, WordAttr>(StrComparer.Default);

            try
            {
                for (var index = 0; index < Config.Custom_Dict_Path.Length; index++)
                {
                    // An entry is either "file" or "file nature"; the part after the first
                    // space names the default nature for that file.
                    var file = Config.Custom_Dict_Path[index];
                    var defaultNature = Nature.n;
                    int separator = file.IndexOf(' ');
                    if (separator > 0)
                    {
                        var natureName = file.Substring(separator + 1);
                        file = file.Substring(0, separator);
                        defaultNature = NatureHelper.GetOrCreate(natureName);
                    }

                    // The result is ignored: a file that fails to load is not reported.
                    Load(file, defaultNature, entries);
                }

                if (entries.Count == 0)
                {
                    // log warning "no items loaded"
                    // Insert a placeholder so the trie is never built from an empty map.
                    entries[Constants.TAG_OTHER] = null;
                }

                dat.Build(entries);

                SaveDat(Config.Custom_Dict_Path[0], entries);
                return true;
            }
            catch (Exception)
            {
                return false;
            }
        }
Beispiel #14
0
        /// <summary>
        /// Loads the saved trie file (building it first when missing) and checks that a known
        /// key resolves to its value.
        /// </summary>
        public void LoadTest()
        {
            var path = Path.Combine(Config.DataRootPath, _testFile);

            if (!File.Exists(path))
            {
                BuildTest();
            }

            DoubleArrayTrie<string> trie = new DoubleArrayTrie<string>();

            trie.Load(path, _mockData.Values.ToList());

            var res = trie.Get("测试key3");

            // Expected value goes first so a failure message labels the two sides correctly.
            Assert.Equal("测试value3", res);
        }
Beispiel #15
0
        /// <summary>
        /// Builds a trie from the mock data, checks that the build reports no errors and saves
        /// it to the test file, replacing any previous copy.
        /// </summary>
        public void BuildTest()
        {
            var path = Path.Combine(Config.DataRootPath, _testFile);

            if (File.Exists(path))
            {
                File.Delete(path);
            }

            DoubleArrayTrie<string> trie = new DoubleArrayTrie<string>();

            var errorCount = trie.Build(_mockData.Keys.ToList(), _mockData.Values.ToList());

            // Expected value goes first so a failure message labels the two sides correctly.
            Assert.Equal(0, errorCount);

            // Reuse the path computed above rather than combining it a second time.
            trie.Save(path);
        }
        /// <summary>
        /// Generates the unigram word net.
        /// </summary>
        /// <param name="wordNetStorage">Word net that supplies the text and receives the vertices.</param>
        protected void GenerateWordNet(WordNet wordNetStorage)
        {
            char[] text = wordNetStorage.charArray;

            // Core dictionary lookup: one vertex per match, stored at row (begin + 1).
            DoubleArrayTrie<CoreDictionary.Attribute>.Searcher hits = CoreDictionary.trie.getSearcher(text, 0);
            while (hits.next())
            {
                var word = new String(text, hits.begin, hits.length);
                wordNetStorage.add(hits.begin + 1, new Vertex(word, hits.value, hits.index));
            }

            // User dictionary lookup (disabled):
            //        if (config.useCustomDictionary)
            //        {
            //            searcher = CustomDictionary.dat.getSearcher(charArray, 0);
            //            while (searcher.next())
            //            {
            //                wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value));
            //            }
            //        }

            // Atom segmentation fills the gaps so the graph stays connected.
            List<Vertex>[] rows = wordNetStorage.getVertexes();
            int row = 1;
            while (row < rows.Length)
            {
                if (rows[row].Count != 0)
                {
                    // Jump ahead by the length of the last word stored in this row.
                    row += rows[row][rows[row].Count - 1].realWord.Length;
                    continue;
                }

                // Find the end of the run of empty rows (the final row is never inspected).
                int next = row + 1;
                while (next < rows.Length - 1 && rows[next].Count == 0)
                {
                    ++next;
                }

                wordNetStorage.add(row, quickAtomSegment(text, row - 1, next - 1));
                row = next;
            }
        }
Beispiel #17
0
        /// <summary>
        /// Builds a double-array trie from the sample trie, round-trips it through a memory
        /// stream and checks the lookup results of the reloaded instance.
        /// </summary>
        /// <param name="compact">Passed to the <c>DoubleArrayTrie</c> constructor.</param>
        void TestSimpleTrie(bool compact)
        {
            var source = MakeTrie();

            var built = new DoubleArrayTrie(compact);
            built.Build(source);

            // Serialize, rewind and read back.
            DoubleArrayTrie reloaded;
            using (var buffer = new MemoryStream())
            {
                built.Write(buffer);
                buffer.Position = 0;
                reloaded = DoubleArrayTrie.Read(buffer);
            }

            reloaded.Lookup("a").Is(0);
            (reloaded.Lookup("abc") > 0).IsTrue();
            (reloaded.Lookup("あいう") > 0).IsTrue();
            (reloaded.Lookup("xyz") < 0).IsTrue();
        }
        /// <summary>
        /// Loads the string-to-int dictionary at <paramref name="path"/> into <c>_trie</c>.
        /// The "<paramref name="path"/>.value.dat" / "<paramref name="path"/>.trie.dat" caches are
        /// tried first; otherwise the text file is parsed, the trie is rebuilt and both caches
        /// are written again.
        /// </summary>
        /// <param name="path">Path of the text dictionary; each line holds a key and an integer separated by spaces or tabs.</param>
        public static void Load(string path)
        {
            _trie = new DoubleArrayTrie<int>();
            var valueArr = LoadDat(path + ".value.dat");

            if (valueArr != null && _trie.Load(path + ".trie.dat", valueArr))
            {
                return;
            }

            var map = new SortedDictionary<string, int>(StrComparer.Default);

            foreach (var line in File.ReadLines(path))
            {
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
                map[segs[0]] = int.Parse(segs[1]);
            }

            // Start from a fresh trie in case the failed cache load left it partially filled.
            _trie = new DoubleArrayTrie<int>();
            _trie.Build(map);

            // Values are stored in the map's (sorted) key order.
            valueArr = new int[map.Count];
            int m = 0;
            foreach (var v in map.Values)
            {
                valueArr[m++] = v;
            }

            // Dispose the stream even when Save throws so the cache file is not left locked.
            using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
            {
                _trie.Save(fs);
            }

            SaveDat(path + ".value.dat", valueArr);
        }
 /// <summary>
 /// Creates a searcher that operates on <paramref name="arrayTrie"/>.
 /// </summary>
 /// <param name="arrayTrie">Trie to search in.</param>
 public DoubleArrayTrieSearcher(DoubleArrayTrie<T> arrayTrie) => _arrayTrie = arrayTrie;
 /// <summary>
 /// Creates a searcher over <paramref name="chars"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="chars">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(char[] chars, DoubleArrayTrie<string> trie)
     : base(chars) => _trie = trie;
Beispiel #21
0
 /// <summary>
 /// Creates a searcher over <paramref name="text"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="text">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(string text, DoubleArrayTrie<Pinyin[]> trie)
     : base(text) => _trie = trie;
 /// <summary>
 /// Creates a searcher over <paramref name="c"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="c">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 protected Searcher(char[] c, DoubleArrayTrie<String> trie)
     : base(c) => this.trie = trie;
Beispiel #23
0
 /// <summary>
 /// Creates a searcher over <paramref name="cs"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="cs">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(char[] cs, DoubleArrayTrie<V> trie)
     : base(cs) => _trie = trie;
Beispiel #24
0
        /// <summary>
        /// Merges the coarse segmentation result using the user dictionary.
        /// The vertices are copied into an array, adjacent vertices whose combined text is found
        /// in <c>CustomDictionary.trie</c> are replaced by a single vertex, and
        /// <paramref name="vertexList"/> is then cleared and refilled with the surviving vertices.
        /// </summary>
        /// <param name="vertexList">Coarse segmentation result; modified in place.</param>
        /// <returns>The same <paramref name="vertexList"/> instance, holding the merged result.</returns>
        protected static LinkedList <Vertex> combineByCustomDictionary(LinkedList <Vertex> vertexList)
        {
            Vertex[] wordNet = vertexList.ToArray();
            // Merge using the double-array trie (DAT).
            // NOTE(review): the merge logic of this pass is commented out below, so the pass only
            // calls dat.transition and does not change wordNet.
            DoubleArrayTrie <CoreDictionary.Attribute> dat = CustomDictionary.dat;

            for (int i = 0; i < wordNet.Length; ++i)
            {
                int state = 1;
                state = dat.transition(wordNet[i].realWord, state);
                if (state > 0)
                {
                    // These locals are only used by the disabled code below.
                    int start = i;
                    int to    = i + 1;
                    int end   = to;
                    //CoreDictionary.Attribute value = dat.output(state);
                    //for (; to < wordNet.Length; ++to)
                    //{
                    //    state = dat.transition(wordNet[to].realWord, state);
                    //    if (state < 0) break;
                    //    CoreDictionary.Attribute output = dat.output(state);
                    //    if (output != null)
                    //    {
                    //        value = output;
                    //        end = to + 1;
                    //    }
                    //}
                    //if (value != null)
                    //{
                    //    StringBuilder sbTerm = new StringBuilder();
                    //    for (int j = start; j < end; ++j)
                    //    {
                    //        sbTerm.Append(wordNet[j]);
                    //        wordNet[j] = null;
                    //    }
                    //    wordNet[i] = new Vertex(sbTerm.ToString(), value);
                    //    i = end - 1;
                    //}
                }
            }
            // Merge using the BinTrie.
            if (CustomDictionary.trie != null)
            {
                for (int i = 0; i < wordNet.Length; ++i)
                {
                    // Slots nulled by an earlier merge are skipped.
                    if (wordNet[i] == null)
                    {
                        continue;
                    }
                    BaseNode <CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.ToCharArray(), 0);
                    if (state != null)
                    {
                        int start = i;
                        int to    = i + 1;
                        int end   = to;
                        CoreDictionary.Attribute value = state.getValue();
                        // Keep extending across following vertices while the trie still has a
                        // transition; remember the last position at which a value was present.
                        for (; to < wordNet.Length; ++to)
                        {
                            if (wordNet[to] == null)
                            {
                                continue;
                            }
                            state = state.transition(wordNet[to].realWord.ToCharArray(), 0);
                            if (state == null)
                            {
                                break;
                            }
                            if (state.getValue() != null)
                            {
                                value = state.getValue();
                                end   = to + 1;
                            }
                        }
                        if (value != null)
                        {
                            // Concatenate vertices [start, end) into one term and null out the
                            // merged slots; slot i then receives the combined vertex.
                            StringBuilder sbTerm = new StringBuilder();
                            for (int j = start; j < end; ++j)
                            {
                                if (wordNet[j] == null)
                                {
                                    continue;
                                }
                                // NOTE(review): appends the Vertex itself (its ToString()), not
                                // realWord — confirm ToString() yields the word text.
                                sbTerm.Append(wordNet[j]);
                                wordNet[j] = null;
                            }
                            wordNet[i] = new Vertex(sbTerm.ToString(), value);
                            // Resume scanning after the merged span.
                            i          = end - 1;
                        }
                    }
                }
            }
            // Rebuild the list from the slots that were not nulled out.
            vertexList.Clear();
            foreach (Vertex vertex in wordNet)
            {
                if (vertex != null)
                {
                    vertexList.AddLast(vertex);
                }
            }
            return(vertexList);
        }
Beispiel #25
0
 /// <summary>
 /// Creates a searcher over the characters of <paramref name="text"/> backed by
 /// <paramref name="trie"/>, starting with an empty entry list.
 /// </summary>
 /// <param name="text">Text to scan; converted to a char array for the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(string text, DoubleArrayTrie<V> trie)
     : base(text.ToCharArray())
 {
     _trie = trie;
     _entries = new List<Tuple<string, V>>();
 }
        /// <summary>
        /// Loads the area dictionary at <paramref name="path"/> into <c>_trie</c>.
        /// The "<paramref name="path"/>.value.dat" / "<paramref name="path"/>.trie.dat" caches are
        /// tried first; otherwise the text file is parsed, name variants are registered through
        /// <c>AddInMap</c>, the trie is rebuilt and both caches are written again.
        /// </summary>
        /// <param name="path">Path of the text dictionary; each line holds a code followed by one or more area names, separated by spaces or tabs.</param>
        /// <returns><c>true</c> on success; <c>false</c> when any step threw.</returns>
        public static bool Load(string path)
        {
            try
            {
                _trie = new DoubleArrayTrie<AreaInfo>();
                var valueArr = LoadDat(path + ".value.dat");
                if (valueArr != null && _trie.Load(path + ".trie.dat", valueArr))
                {
                    return true;
                }

                // Fall back to the text file.
                var map = new SortedDictionary<string, AreaInfo>(StrComparer.Default);
                foreach (var line in File.ReadLines(path))
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
                    var code = segs[0];
                    for (int i = 1; i < segs.Length; i++)
                    {
                        var name = segs[i];
                        if (Invalids.Contains(name))
                        {
                            continue; // skip invalid area names
                        }

                        if (name.Length == 2)
                        {
                            AddInMap(name, "", code, map);
                            continue;
                        }

                        // NOTE(review): names shorter than 3 characters make the Substring calls
                        // below throw, which aborts the whole load via the catch — confirm the
                        // data never contains such names.
                        var lastChar = name[name.Length - 1];
                        if ("市省县区州旗盟".Contains(lastChar))
                        {
                            // Register the name without its one-character administrative suffix.
                            AddInMap(name.Substring(0, name.Length - 1), lastChar.ToString(), code, map);
                        }
                        else if (name.Length < 9)
                        {
                            AddInMap(name, "", code, map);
                        }

                        // NOTE(review): "lastTwo" is everything from index 2 on, which is the last
                        // two characters only for 4-character names — confirm the intent.
                        var lastTwo = name.Substring(2);
                        var prevs = name.Substring(0, name.Length - 2);
                        if (Invalids.Contains(lastTwo))
                        {
                            AddInMap(prevs, lastTwo, code, map);
                            if (prevs.Length == 3 && "市省".Contains(prevs[2]))
                            {
                                AddInMap(name.Substring(0, 2), lastTwo, code, map);
                            }
                        }

                        if (lastChar == '旗')
                        {
                            // NOTE(review): reads index 2, which is the second-to-last character
                            // only for 4-character names — confirm the intent.
                            var sublast = name[2];
                            if ("前后左中右特".Contains(sublast))
                            {
                                AddInMap(prevs, "旗", code, map);
                            }
                        }

                        var subLastTwo = name.Substring(name.Length - 3, 2);
                        if (subLastTwo != "自治")
                        {
                            continue;
                        }

                        // Name ends in "自治" + one character: split off that three-character ending.
                        prevs = name.Substring(0, name.Length - 3);
                        var ends = name.Substring(name.Length - 3);
                        AddInMap(prevs, ends, code, map);
                        if (prevs.Length < 4)
                        {
                            continue;
                        }

                        // Find the first position (from index 2) at which a nationality name of
                        // length 4, 3 or 2 starts, trying the longest first, and register the
                        // part before it as an additional variant.
                        var matched = false;
                        for (int k = 2; k < prevs.Length - 1 && !matched; k++)
                        {
                            for (int len = 4; len >= 2; len--)
                            {
                                if (k + len > prevs.Length)
                                {
                                    continue; // candidate would run past the end
                                }

                                if (!Nationalities.Contains(prevs.Substring(k, len)))
                                {
                                    continue;
                                }

                                var head = prevs.Substring(0, k);
                                AddInMap(head, ends, code, map);
                                AddInMap(head + "自治", lastChar.ToString(), code, map);
                                matched = true;
                                break;
                            }
                        }
                    }
                }

                // Start from a fresh trie in case the failed cache load left it partially filled.
                _trie = new DoubleArrayTrie<AreaInfo>();
                _trie.Build(map);

                // Values are stored in the map's (sorted) key order.
                valueArr = new AreaInfo[map.Count];
                int m = 0;
                foreach (var v in map.Values)
                {
                    valueArr[m++] = v;
                }

                // Dispose the stream even when Save throws so the cache file is not left locked.
                using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
                {
                    _trie.Save(fs);
                }

                SaveDat(path + ".value.dat", valueArr);
                return true;
            }
            catch (Exception)
            {
                return false;
            }
        }
 /// <summary>
 /// Creates a searcher over <paramref name="c"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="c">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(char[] c, DoubleArrayTrie<char> trie)
     : base(c) => this.trie = trie;
 /// <summary>
 /// Creates a searcher over <paramref name="text"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="text">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(String text, DoubleArrayTrie<char> trie)
     : base(text) => this.trie = trie;
Beispiel #29
0
 /// <summary>
 /// Creates a searcher over <paramref name="chars"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="chars">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 public Searcher(char[] chars, DoubleArrayTrie<Pinyin[]> trie)
     : base(chars) => _trie = trie;
 /// <summary>
 /// Creates a searcher over <paramref name="text"/> backed by <paramref name="trie"/>.
 /// </summary>
 /// <param name="text">Text to scan; forwarded to the base constructor.</param>
 /// <param name="trie">Trie to search in.</param>
 protected Searcher(String text, DoubleArrayTrie<String> trie)
     : base(text) => this.trie = trie;