/// <summary>
/// Loads the dictionary into the static trie. A binary cache
/// (<c>path + ".bi" + Predefine.BIN_EXT</c>) is tried first; otherwise the
/// text file at <paramref name="path"/> is parsed and the trie is rebuilt.
/// </summary>
/// <param name="path">Path of the text dictionary (one "key value" pair per line, space separated).</param>
/// <returns><c>true</c> when the trie was loaded or built; <c>false</c> when no entry could be read.</returns>
private static bool Load(string path)
{
    _trie = new DoubleArrayTrie<string>();
    if (LoadDat(path + ".bi" + Predefine.BIN_EXT))
    {
        return true;
    }

    var map = new SortedDictionary<string, string>(StrComparer.Default);
    foreach (var line in File.ReadLines(path))
    {
        var param = line.Split(' ');
        if (param[0].EndsWith("@"))
        {
            // Keys ending with '@' are skipped.
            continue;
        }
        if (param.Length < 2)
        {
            // Blank or single-column line: there is no value to map, so skip it
            // instead of failing with an IndexOutOfRangeException.
            continue;
        }

        map[param[0]] = param[1];
    }

    if (map.Count == 0)
    {
        return false;
    }

    _trie.Build(map);
    if (!SaveDat(path, map))
    {
        // log error
    }
    return true;
}
//public static BaseSearcher getSearcher(char[] charArray, DoubleArrayTrie<String> trie)
//{
//    return new Searcher(charArray, trie);
//}

/// <summary>
/// Placeholder for longest-match segmentation; currently not implemented and
/// always returns <c>null</c>. The disabled reference implementation is kept below.
/// </summary>
/// <param name="charArray">Text to process (unused for now).</param>
/// <param name="trie">Dictionary trie (unused for now).</param>
/// <returns>Always <c>null</c>.</returns>
protected static String segLongest(char[] charArray, DoubleArrayTrie<String> trie)
{
    //StringBuilder sb = new StringBuilder(charArray.Length);
    //BaseSearcher searcher = getSearcher(charArray, trie);
    //KeyValuePair<String, String> entry;
    //int p = 0;  // position processed so far
    //int offset;
    //while ((entry = searcher.next()) != null)
    //{
    //    offset = searcher.getOffset();
    //    // copy the characters that were not matched
    //    while (p < offset)
    //    {
    //        sb.Append(charArray[p]);
    //        ++p;
    //    }
    //    sb.Append(entry.Value);
    //    p = offset + entry.Key.Length;
    //}
    //// copy the trailing characters that were not matched
    //while (p < charArray.Length)
    //{
    //    sb.Append(charArray[p]);
    //    ++p;
    //}
    //return sb.ToString();
    return null;
}
/// <summary>
/// Rewrites <paramref name="chars"/> by replacing every match reported by the
/// searcher with its associated value; unmatched characters are copied verbatim.
/// </summary>
/// <param name="chars">Source text.</param>
/// <param name="trie">Trie mapping matched keys to replacement strings.</param>
/// <returns>The rewritten text.</returns>
public static string Seg4Longest(char[] chars, DoubleArrayTrie<string> trie)
{
    var sb = new StringBuilder(chars.Length);
    var searcher = new Searcher(chars, trie);
    int pos = 0; // first source index not yet emitted
    Tuple<string, string> t;
    while ((t = searcher.Next()) != null)
    {
        int offset = searcher.Offset;
        // Copy the unmatched gap before this match.
        if (pos < offset)
        {
            sb.Append(chars, pos, offset - pos);
        }

        sb.Append(t.Item2);
        // Advance by the length of the matched KEY (the span consumed in the
        // source), not by the length of the replacement value: the two may differ.
        // NOTE(review): assumes Item1 is the matched key and Item2 its value — confirm against Searcher.Next().
        pos = offset + t.Item1.Length;
    }

    // Copy whatever remains after the last match.
    if (pos < chars.Length)
    {
        sb.Append(chars, pos, chars.Length - pos);
    }
    return sb.ToString();
}
/// <summary>
/// Compiles the token-info dictionary from <paramref name="inputDirname"/>, builds and
/// writes the double array trie, re-reads it to check every surface form, then records
/// the trie-id to surface-index mapping and writes the result to <paramref name="outputDirname"/>.
/// </summary>
void BuildTokenInfoDictionary(string inputDirname, string outputDirname, Encoding encoding, bool compactTries)
{
    ProgressLog.Begin("compiling tokeninfo dict");
    var compiler = GetTokenInfoDictionaryCompiler(encoding);

    // Pass 1: analyze features.
    ProgressLog.Println("analyzing dictionary features");
    using (var input = compiler.CombinedSequentialFileInputStream(inputDirname))
    {
        compiler.AnalyzeTokenInfo(input);
    }

    // Pass 2: read the entries.
    ProgressLog.Println("reading tokeninfo");
    using (var input = compiler.CombinedSequentialFileInputStream(inputDirname))
    {
        compiler.ReadTokenInfo(input);
    }

    compiler.Compile();
    var surfaces = compiler.Surfaces;

    ProgressLog.Begin("compiling double array trie");
    var triePath = Path.Combine(outputDirname, DoubleArrayTrie.DoubleArrayTrieFileName);
    using (var output = new FileStream(triePath, FileMode.Create, FileAccess.ReadWrite))
    {
        var built = DoubleArrayTrieCompiler.Build(surfaces, compactTries);
        built.Write(output);
    }

    // Read the file back and make sure every surface can be looked up.
    ProgressLog.Println("validating saved double array trie");
    DoubleArrayTrie daTrie;
    using (var saved = new FileStream(triePath, FileMode.Open, FileAccess.Read))
    {
        daTrie = DoubleArrayTrie.Read(saved);
        foreach (var surface in surfaces)
        {
            var missing = daTrie.Lookup(surface) < 0;
            if (missing)
            {
                ProgressLog.Println("failed to look up [" + surface + "]");
            }
        }
    }
    ProgressLog.End();

    ProgressLog.Begin("processing target map");
    for (var index = 0; index < surfaces.Count; index++)
    {
        tokenInfoCompiler_AddMapping:
        {
            int trieId = daTrie.Lookup(surfaces[index]);
            compiler.AddMapping(trieId, index);
        }
    }
    compiler.Write(outputDirname); // TODO: Should be refactored -Christian
    ProgressLog.End();

    ProgressLog.End();
}
/// <summary>
/// Looks up every prefix of <paramref name="suffix"/> in the trie and adds a known-word
/// node to the lattice for each word id of a matching prefix.
/// </summary>
/// <param name="lattice">Lattice receiving the nodes.</param>
/// <param name="startIndex">Index in the text at which <paramref name="suffix"/> starts.</param>
/// <param name="suffix">Remaining text starting at <paramref name="startIndex"/>.</param>
/// <returns><c>true</c> when at least one prefix had a positive lookup result.</returns>
bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
{
    var anyMatch = false;
    var length = 1;
    while (length <= suffix.Length)
    {
        var prefix = suffix.Substring(0, length);
        var lookup = DoubleArrayTrie.Lookup(prefix);

        if (lookup < 0)
        {
            // If result is less than zero, continue to next position
            break;
        }

        if (lookup > 0)
        {
            anyMatch = true; // Don't produce unknown word starting from this index
            foreach (var wordId in Dictionary.LookupWordIds(lookup))
            {
                var known = new ViterbiNode(wordId, prefix, Dictionary, startIndex, ViterbiNode.NodeType.Known);
                lattice.AddNode(known, startIndex + 1, startIndex + 1 + length);
            }
        }

        length++;
    }
    return anyMatch;
}
/// <summary>
/// Creates every dictionary instance used by this object from <c>Resolver</c>.
/// <c>CharacterDefinitions</c> is assigned before <c>UnknownDictionary</c> because the
/// latter is constructed from it.
/// </summary>
protected internal virtual void LoadDictionaries()
{
    // Resolver-backed resources.
    DoubleArrayTrie = DoubleArrayTrie.NewInstance(Resolver);
    ConnectionCosts = ConnectionCosts.NewInstance(Resolver);
    TokenInfoDictionary = TokenInfoDictionary.NewInstance(Resolver);
    CharacterDefinitions = CharacterDefinitions.NewInstance(Resolver);

    // Depends on CharacterDefinitions assigned just above.
    UnknownDictionary = UnknownDictionary.NewInstance(
        Resolver,
        CharacterDefinitions,
        TotalFeatures);

    InsertedDictionary = new InsertedDictionary(TotalFeatures);
}
/// <summary>
/// Loads the trie for the dictionary at <paramref name="path"/>. The value array is
/// obtained from <c>OnLoadValue</c>; the trie itself is read from the
/// <c>path + ".trie.dat"</c> cache when possible, otherwise it is rebuilt from the text
/// file and the cache is rewritten.
/// </summary>
/// <param name="path">Path of the text dictionary; the first whitespace-separated token of each line is the key.</param>
/// <returns><c>true</c> on success; <c>false</c> when values could not be loaded or the text file could not be read.</returns>
public bool Load(string path)
{
    _trie = new DoubleArrayTrie<V>();
    var valueArr = OnLoadValue(path);
    if (valueArr == null)
    {
        // log info ""
        return false;
    }
    if (LoadDat(path + ".trie.dat", valueArr))
    {
        // log info ""
        return true;
    }

    var keys = new List<string>(valueArr.Length);
    try
    {
        foreach (var line in File.ReadLines(path))
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
            keys.Add(segs[0]);
        }
    }
    catch (Exception)
    {
        // The key file could not be read completely; building a trie from a partial
        // key list would be wrong, so report failure instead of silently continuing.
        return false;
    }

    var error = _trie.Build(keys, valueArr);
    if (error != 0) // build reported an error
    {
        // Fall back to building from a sorted map, then re-order the values to
        // follow the map's ordering so they stay aligned with the trie.
        var map = new SortedDictionary<string, V>(StrComparer.Default);
        for (int i = 0; i < valueArr.Length; i++)
        {
            map[keys[i]] = valueArr[i];
        }
        _trie = new DoubleArrayTrie<V>();
        _trie.Build(map);

        int j = 0;
        foreach (var v in map.Values)
        {
            valueArr[j++] = v;
        }
    }

    // Dispose the stream even if Save throws.
    using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
    {
        _trie.Save(fs);
    }
    OnSaveValue(valueArr, path);
    return true;
}
/// <summary>
/// Loads the translated-person-name trie. The binary cache is tried first; otherwise
/// the text dictionary at <c>Config.Translated_Person_Dict_Path</c> is read, each line
/// becomes a key, and single characters seen at least 10 times are added as keys too.
/// </summary>
/// <returns><c>true</c> on success; <c>false</c> if any exception occurs while reading or building.</returns>
private static bool Load()
{
    _trie = new DoubleArrayTrie<bool>();
    if (LoadDat())
    {
        return true;
    }

    // Read the dictionary from the original text file.
    try
    {
        // Translated person names; a present key always maps to true.
        var names = new SortedDictionary<string, bool>(StrComparer.Default);
        // Occurrence count of each character across all names.
        var charCounts = new SortedDictionary<char, int>();

        foreach (var name in File.ReadLines(Config.Translated_Person_Dict_Path))
        {
            names[name] = true;
            foreach (var ch in name)
            {
                // Exclude a few uncommon characters from the statistics.
                if ("不赞".IndexOf(ch) >= 0)
                {
                    continue;
                }

                // A missing key leaves seen at 0, so the first occurrence stores 1.
                charCounts.TryGetValue(ch, out int seen);
                charCounts[ch] = seen + 1;
            }
        }

        names["·"] = true;
        foreach (var entry in charCounts)
        {
            // Characters seen fewer than 10 times are ignored; the rest are treated
            // as a short form of a name and accepted as valid names.
            if (entry.Value >= 10)
            {
                names[entry.Key.ToString()] = true;
            }
        }

        _trie.Build(names);
        return true;
    }
    catch (Exception)
    {
        return false;
    }
}
/// <summary>
/// Creates a builder over the given dictionaries.
/// </summary>
/// <param name="doubleArrayTrie">Trie used for lookups.</param>
/// <param name="dictionary">Token-info dictionary.</param>
/// <param name="unknownDictionary">Unknown-word dictionary; also the source of the character definitions.</param>
/// <param name="userDictionary">Optional user dictionary; <c>null</c> disables its use.</param>
/// <param name="mode">Tokenizer mode; Search and Extended enable search mode.</param>
public ViterbiBuilder(
    DoubleArrayTrie doubleArrayTrie,
    TokenInfoDictionary dictionary,
    UnknownDictionary unknownDictionary,
    UserDictionary userDictionary,
    TokenizerMode mode)
{
    DoubleArrayTrie = doubleArrayTrie;
    Dictionary = dictionary;
    UnknownDictionary = unknownDictionary;

    UserDictionary = userDictionary;
    UseUserDictionary = userDictionary != null;

    SearchMode = mode == TokenizerMode.Search || mode == TokenizerMode.Extended;

    // Read from the property that was just assigned.
    CharacterDefinitions = UnknownDictionary.CharacterDefinition;
}
/// <summary>
/// Builds a double array trie containing every entry of <paramref name="surfaces"/>.
/// </summary>
/// <param name="surfaces">Surface forms to insert, in iteration order.</param>
/// <param name="compact">Passed to the <c>DoubleArrayTrie</c> constructor.</param>
/// <returns>The built trie.</returns>
public static DoubleArrayTrie Build(List<string> surfaces, bool compact)
{
    // Stage 1: collect the surfaces in an intermediate trie.
    var source = new Trie.Trie();
    foreach (var entry in surfaces)
    {
        source.Add(entry);
    }

    // Stage 2: convert it to the double array representation.
    var result = new DoubleArrayTrie(compact);
    result.Build(source);
    return result;
}
/// <summary>
/// Static initializer: creates the trie, loads the core dictionary from <c>path</c>
/// and logs either the failure or the entry count with the elapsed time.
/// </summary>
static CoreDictionary()
{
    Trie = new DoubleArrayTrie<Attribute>();

    var timer = Stopwatch.StartNew();
    var loaded = Load(path);
    if (loaded)
    {
        timer.Stop();
        Predefine.logger.Info(path + "加载成功," + Trie.Length + "个词条,耗时" + timer.ElapsedMilliseconds + "ms");
        return;
    }

    Predefine.logger.Error("核心词典" + path + "加载失败");
}
/// <summary>
/// Creates a searcher over <paramref name="chars"/> starting at <paramref name="offset"/>.
/// </summary>
/// <param name="offset">Index at which searching starts.</param>
/// <param name="chars">Text to search; for an empty array <c>begin</c> is set to -1.</param>
/// <param name="dat">Trie to search in; its <c>_base[0]</c> seeds <c>last</c>.</param>
public Searcher(int offset, char[] chars, DoubleArrayTrie<V> dat)
{
    _dat = dat;
    charArr = chars;
    i = offset;
    last = dat._base[0];
    begin = chars.Length == 0 ? -1 : offset;
}
/// <summary>
/// Loads the custom dictionary. The binary cache of the first configured path is tried
/// first; otherwise every entry of <c>Config.Custom_Dict_Path</c> is loaded into one
/// sorted map, the trie is built and the cache is saved.
/// </summary>
/// <returns><c>true</c> on success; <c>false</c> if any exception occurs while loading or building.</returns>
private static bool Load()
{
    if (LoadDat(Config.Custom_Dict_Path[0]))
    {
        return true;
    }

    dat = new DoubleArrayTrie<WordAttr>();
    var entries = new SortedDictionary<string, WordAttr>(StrComparer.Default);
    try
    {
        foreach (var configured in Config.Custom_Dict_Path)
        {
            var file = configured;   // path of the current custom dictionary file
            var defNat = Nature.n;

            // An entry may be "path nature": the text after the first space is a default nature.
            int split = file.IndexOf(' ');
            if (split > 0)
            {
                var nat = file.Substring(split + 1);
                file = file.Substring(0, split);
                // NOTE(review): the default nature is parsed but not applied; the assignment below is disabled.
                // defNat = NatureHelper.GetOrCreate(nat);
            }

            Load(file, defNat, entries);
            //bool success =
            //if(!success)
            //    log warning "loading file failed: " + p
        }

        if (entries.Count == 0)
        {
            // log warning "no items loaded"
            entries[Constants.TAG_OTHER] = null; // used as a blank placeholder
        }

        dat.Build(entries);
        SaveDat(Config.Custom_Dict_Path[0], entries);
        return true;
    }
    catch (Exception)
    {
        return false;
    }
}
/// <summary>
/// Verifies that a trie saved by <c>BuildTest</c> can be loaded back and returns the
/// expected value for a known key. Builds the file first when it does not exist.
/// </summary>
public void LoadTest()
{
    var path = Path.Combine(Config.DataRootPath, _testFile);
    if (!File.Exists(path))
    {
        BuildTest();
    }

    DoubleArrayTrie<string> trie = new DoubleArrayTrie<string>();
    trie.Load(path, _mockData.Values.ToList());

    var res = trie.Get("测试key3");

    // Expected value goes first so failure messages label the values correctly.
    Assert.Equal("测试value3", res);
}
/// <summary>
/// Verifies that the trie builds from the mock data without errors and saves it to the
/// test file, replacing any previous file.
/// </summary>
public void BuildTest()
{
    var path = Path.Combine(Config.DataRootPath, _testFile);
    if (File.Exists(path))
    {
        File.Delete(path);
    }

    DoubleArrayTrie<string> trie = new DoubleArrayTrie<string>();
    var errorCount = trie.Build(_mockData.Keys.ToList(), _mockData.Values.ToList());

    // Expected value goes first so failure messages label the values correctly.
    Assert.Equal(0, errorCount);

    // Same location as computed above.
    trie.Save(path);
}
/// <summary>
/// Generates the unigram word net: adds a vertex for every core-dictionary match, then
/// fills every empty row with atom segments so the graph stays connected.
/// </summary>
/// <param name="wordNetStorage">Word net to fill; its <c>charArray</c> is the text.</param>
protected void GenerateWordNet(WordNet wordNetStorage)
{
    char[] text = wordNetStorage.charArray;

    // Core dictionary lookup.
    var hits = CoreDictionary.trie.getSearcher(text, 0);
    while (hits.next())
    {
        var word = new String(text, hits.begin, hits.length);
        wordNetStorage.add(hits.begin + 1, new Vertex(word, hits.value, hits.index));
    }

    // User dictionary lookup (disabled).
    // if (config.useCustomDictionary)
    // {
    //     searcher = CustomDictionary.dat.getSearcher(charArray, 0);
    //     while (searcher.next())
    //     {
    //         wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value));
    //     }
    // }

    // Atom segmentation, to keep the graph connected.
    List<Vertex>[] rows = wordNetStorage.getVertexes();
    int row = 1;
    while (row < rows.Length)
    {
        if (rows[row].Count != 0)
        {
            // Jump ahead by the length of the last word stored in this row.
            List<Vertex> current = rows[row];
            row += current[current.Count - 1].realWord.Length;
            continue;
        }

        // Find the end of the run of empty rows (the last row is never inspected).
        int next = row + 1;
        while (next < rows.Length - 1 && rows[next].Count == 0)
        {
            ++next;
        }

        wordNetStorage.add(row, quickAtomSegment(text, row - 1, next - 1));
        row = next;
    }
}
/// <summary>
/// Builds a double array trie from <c>MakeTrie()</c>, round-trips it through a memory
/// stream and checks the lookup results of the re-read instance.
/// </summary>
/// <param name="compact">Passed to the <c>DoubleArrayTrie</c> constructor.</param>
void TestSimpleTrie(bool compact)
{
    var source = MakeTrie();
    var doubleArrayTrie = new DoubleArrayTrie(compact);
    doubleArrayTrie.Build(source);

    // Serialize, rewind, and read back.
    using (var buffer = new MemoryStream())
    {
        doubleArrayTrie.Write(buffer);
        buffer.Seek(0, SeekOrigin.Begin);
        doubleArrayTrie = DoubleArrayTrie.Read(buffer);
    }

    doubleArrayTrie.Lookup("a").Is(0);

    var abcFound = doubleArrayTrie.Lookup("abc") > 0;
    abcFound.IsTrue();

    var kanaFound = doubleArrayTrie.Lookup("あいう") > 0;
    kanaFound.IsTrue();

    var xyzMissing = doubleArrayTrie.Lookup("xyz") < 0;
    xyzMissing.IsTrue();
}
/// <summary>
/// Loads the integer-valued trie for the dictionary at <paramref name="path"/>. The
/// caches <c>path + ".value.dat"</c> and <c>path + ".trie.dat"</c> are tried first;
/// otherwise the text file is parsed ("key value" per line, separated by spaces or
/// tabs), the trie is rebuilt and both caches are rewritten.
/// </summary>
/// <param name="path">Path of the text dictionary.</param>
public static void Load(string path)
{
    _trie = new DoubleArrayTrie<int>();
    var valueArr = LoadDat(path + ".value.dat");
    if (valueArr != null)
    {
        if (_trie.Load(path + ".trie.dat", valueArr))
        {
            return;
        }
    }

    var map = new SortedDictionary<string, int>(StrComparer.Default);
    foreach (var line in File.ReadLines(path))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }
        var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
        map[segs[0]] = int.Parse(segs[1]);
    }

    // Start from a fresh trie: the failed cache load above may have touched the old one.
    _trie = new DoubleArrayTrie<int>();
    _trie.Build(map);

    // Values in the map's iteration order.
    valueArr = new int[map.Count];
    int m = 0;
    foreach (var v in map.Values)
    {
        valueArr[m++] = v;
    }

    // Dispose the stream even if Save throws.
    using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
    {
        _trie.Save(fs);
    }
    SaveDat(path + ".value.dat", valueArr);
}
/// <summary>
/// Creates a searcher bound to the given trie.
/// </summary>
/// <param name="arrayTrie">Trie stored in <c>_arrayTrie</c>.</param>
public DoubleArrayTrieSearcher(DoubleArrayTrie<T> arrayTrie)
{
    _arrayTrie = arrayTrie;
}
/// <summary>
/// Creates a searcher over a character array using a string-valued trie.
/// </summary>
/// <param name="chars">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in <c>_trie</c>.</param>
public Searcher(char[] chars, DoubleArrayTrie<string> trie)
    : base(chars)
{
    _trie = trie;
}
/// <summary>
/// Creates a searcher over a string using a pinyin-array-valued trie.
/// </summary>
/// <param name="text">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in <c>_trie</c>.</param>
public Searcher(string text, DoubleArrayTrie<Pinyin[]> trie)
    : base(text)
{
    _trie = trie;
}
/// <summary>
/// Creates a searcher over a character array using a string-valued trie.
/// </summary>
/// <param name="c">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in the <c>trie</c> field.</param>
protected Searcher(char[] c, DoubleArrayTrie<String> trie)
    : base(c)
{
    this.trie = trie;
}
/// <summary>
/// Creates a searcher over a character array using a generic trie.
/// </summary>
/// <param name="cs">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in <c>_trie</c>.</param>
public Searcher(char[] cs, DoubleArrayTrie<V> trie)
    : base(cs)
{
    _trie = trie;
}
/// <summary>
/// Merges the coarse segmentation result using the user dictionary: runs of adjacent
/// vertices whose concatenation is found in <c>CustomDictionary.trie</c> are replaced
/// by a single vertex. The list is modified in place.
/// </summary>
/// <param name="vertexList">Coarse segmentation result.</param>
/// <returns>The same list instance, holding the merged result.</returns>
protected static LinkedList<Vertex> combineByCustomDictionary(LinkedList<Vertex> vertexList)
{
    Vertex[] wordNet = vertexList.ToArray();

    // DAT merge. Only the transition call is active; the merge logic itself is disabled.
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.Length; ++i)
    {
        int state = 1;
        state = dat.transition(wordNet[i].realWord, state);
        if (state > 0)
        {
            int start = i;
            int to = i + 1;
            int end = to;
            //CoreDictionary.Attribute value = dat.output(state);
            //for (; to < wordNet.Length; ++to)
            //{
            //    state = dat.transition(wordNet[to].realWord, state);
            //    if (state < 0) break;
            //    CoreDictionary.Attribute output = dat.output(state);
            //    if (output != null)
            //    {
            //        value = output;
            //        end = to + 1;
            //    }
            //}
            //if (value != null)
            //{
            //    StringBuilder sbTerm = new StringBuilder();
            //    for (int j = start; j < end; ++j)
            //    {
            //        sbTerm.Append(wordNet[j]);
            //        wordNet[j] = null;
            //    }
            //    wordNet[i] = new Vertex(sbTerm.ToString(), value);
            //    i = end - 1;
            //}
        }
    }

    // BinTrie merge.
    if (CustomDictionary.trie != null)
    {
        for (int head = 0; head < wordNet.Length; ++head)
        {
            if (wordNet[head] == null)
            {
                continue;
            }

            BaseNode<CoreDictionary.Attribute> node =
                CustomDictionary.trie.transition(wordNet[head].realWord.ToCharArray(), 0);
            if (node == null)
            {
                continue;
            }

            // Extend the match as far as the trie allows, remembering the last
            // position (exclusive) at which a value was attached.
            int mergeEnd = head + 1;
            CoreDictionary.Attribute merged = node.getValue();
            for (int cursor = head + 1; cursor < wordNet.Length; ++cursor)
            {
                if (wordNet[cursor] == null)
                {
                    continue;
                }
                node = node.transition(wordNet[cursor].realWord.ToCharArray(), 0);
                if (node == null)
                {
                    break;
                }
                CoreDictionary.Attribute candidate = node.getValue();
                if (candidate != null)
                {
                    merged = candidate;
                    mergeEnd = cursor + 1;
                }
            }

            if (merged == null)
            {
                continue;
            }

            // Collapse [head, mergeEnd) into a single vertex stored at head.
            StringBuilder term = new StringBuilder();
            for (int j = head; j < mergeEnd; ++j)
            {
                if (wordNet[j] == null)
                {
                    continue;
                }
                term.Append(wordNet[j]);
                wordNet[j] = null;
            }
            wordNet[head] = new Vertex(term.ToString(), merged);
            head = mergeEnd - 1;
        }
    }

    // Rebuild the list from the surviving (non-null) vertices.
    vertexList.Clear();
    foreach (Vertex vertex in wordNet)
    {
        if (vertex != null)
        {
            vertexList.AddLast(vertex);
        }
    }
    return vertexList;
}
/// <summary>
/// Creates a searcher over a string using a generic trie; starts with an empty entry list.
/// </summary>
/// <param name="text">Text to search; converted to a character array for the base constructor.</param>
/// <param name="trie">Trie stored in <c>_trie</c>.</param>
public Searcher(string text, DoubleArrayTrie<V> trie)
    : base(text.ToCharArray())
{
    _trie = trie;
    _entries = new List<Tuple<string, V>>();
}
/// <summary>
/// Loads the area-name trie for the dictionary at <paramref name="path"/>. The caches
/// <c>path + ".value.dat"</c> and <c>path + ".trie.dat"</c> are tried first; otherwise
/// the text file is parsed (first token is the code, remaining tokens are area names),
/// name variants are registered through <c>AddInMap</c>, the trie is rebuilt and both
/// caches are rewritten.
/// </summary>
/// <param name="path">Path of the text dictionary.</param>
/// <returns><c>true</c> on success; <c>false</c> if any exception occurs.</returns>
public static bool Load(string path)
{
    try
    {
        _trie = new DoubleArrayTrie<AreaInfo>();
        var valueArr = LoadDat(path + ".value.dat");
        if (valueArr != null)
        {
            if (_trie.Load(path + ".trie.dat", valueArr))
            {
                return true;
            }
        }

        // Read the text file.
        var map = new SortedDictionary<string, AreaInfo>(StrComparer.Default);
        foreach (var line in File.ReadLines(path))
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            var segs = line.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
            var code = segs[0];
            for (int i = 1; i < segs.Length; i++)
            {
                var name = segs[i];
                if (Invalids.Contains(name))
                {
                    continue; // skip invalid area names
                }

                if (name.Length == 2)
                {
                    AddInMap(name, "", code, map);
                    continue;
                }

                // Variant 1: strip a one-character administrative suffix.
                var lastChar = name[name.Length - 1];
                if ("市省县区州旗盟".Contains(lastChar))
                {
                    AddInMap(name.Substring(0, name.Length - 1), lastChar.ToString(), code, map);
                }
                else if (name.Length < 9)
                {
                    AddInMap(name, "", code, map);
                }

                // Variant 2: strip a two-character suffix. The suffix must be the LAST two
                // characters so that prevs + lastTwo == name; taking the tail from index 2
                // only gave that result for four-character names.
                var lastTwo = name.Substring(name.Length - 2);
                var prevs = name.Substring(0, name.Length - 2);
                if (Invalids.Contains(lastTwo))
                {
                    AddInMap(prevs, lastTwo, code, map);
                    if (prevs.Length == 3 && "市省".Contains(prevs[2]))
                    {
                        AddInMap(name.Substring(0, 2), lastTwo, code, map);
                    }
                }

                // Variant 3: banner names.
                if (lastChar == '旗')
                {
                    // NOTE(review): the name suggests the second-to-last character, but the
                    // code reads index 2 — left unchanged, confirm the intent.
                    var sublast = name[2];
                    if ("前后左中右特".Contains(sublast))
                    {
                        AddInMap(prevs, "旗", code, map);
                    }
                }

                // Variant 4: autonomous areas ("…自治" + one suffix character).
                var subLastTwo = name.Substring(name.Length - 3, 2);
                if (subLastTwo == "自治")
                {
                    prevs = name.Substring(0, name.Length - 3);
                    var ends = name.Substring(name.Length - 3);
                    AddInMap(prevs, ends, code, map);
                    if (prevs.Length >= 4)
                    {
                        for (int k = 2; k < prevs.Length - 1; k++)
                        {
                            // At each split point try the longest nationality name first
                            // (4, then 3, then 2 characters) and stop at the first hit.
                            var matched = false;
                            for (int len = 4; len >= 2 && !matched; len--)
                            {
                                if (k + len <= prevs.Length && Nationalities.Contains(prevs.Substring(k, len)))
                                {
                                    AddInMap(prevs.Substring(0, k), ends, code, map);
                                    AddInMap(prevs.Substring(0, k) + "自治", lastChar.ToString(), code, map);
                                    matched = true;
                                }
                            }
                            if (matched)
                            {
                                break;
                            }
                        }
                    }
                }
            }
        }

        // Start from a fresh trie: the failed cache load above may have touched the old one.
        _trie = new DoubleArrayTrie<AreaInfo>();
        _trie.Build(map);

        // Values in the map's iteration order.
        valueArr = new AreaInfo[map.Count];
        int m = 0;
        foreach (var v in map.Values)
        {
            valueArr[m++] = v;
        }

        // Dispose the stream even if Save throws.
        using (var fs = new FileStream(path + ".trie.dat", FileMode.Create, FileAccess.Write))
        {
            _trie.Save(fs);
        }
        SaveDat(path + ".value.dat", valueArr);
        return true;
    }
    catch (Exception)
    {
        return false;
    }
}
/// <summary>
/// Creates a searcher over a character array using a char-valued trie.
/// </summary>
/// <param name="c">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in the <c>trie</c> field.</param>
public Searcher(char[] c, DoubleArrayTrie<char> trie)
    : base(c)
{
    this.trie = trie;
}
/// <summary>
/// Creates a searcher over a string using a char-valued trie.
/// </summary>
/// <param name="text">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in the <c>trie</c> field.</param>
public Searcher(String text, DoubleArrayTrie<char> trie)
    : base(text)
{
    this.trie = trie;
}
/// <summary>
/// Creates a searcher over a character array using a pinyin-array-valued trie.
/// </summary>
/// <param name="chars">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in <c>_trie</c>.</param>
public Searcher(char[] chars, DoubleArrayTrie<Pinyin[]> trie)
    : base(chars)
{
    _trie = trie;
}
/// <summary>
/// Creates a searcher over a string using a string-valued trie.
/// </summary>
/// <param name="text">Text to search; forwarded to the base constructor.</param>
/// <param name="trie">Trie stored in the <c>trie</c> field.</param>
protected Searcher(String text, DoubleArrayTrie<String> trie)
    : base(text)
{
    this.trie = trie;
}