public static bool MakeModel(string corpusPath, string modelSavePath)
{
    var posSet = new SortedSet<string>(StrComparer.Default);
    var dm = new DictionaryMaker();
    for (int i = 0; i < 2; i++) // run two passes; this simply raises every label's frequency from 1 to 2
    {
        foreach (var s in CoNLLUtil.LoadSentences(corpusPath))
        {
            foreach (var w in s.word)
            {
                // record word/word, word/POS, POS/word and POS/POS head-dependent pairs with their relation label
                AddPair(w.NAME, w.HEAD.NAME, w.DEPREL, dm);
                AddPair(w.NAME, WrapTag(w.HEAD.POSTAG), w.DEPREL, dm);
                AddPair(WrapTag(w.POSTAG), w.HEAD.NAME, w.DEPREL, dm);
                AddPair(WrapTag(w.POSTAG), WrapTag(w.HEAD.POSTAG), w.DEPREL, dm);
                posSet.Add(w.POSTAG);
            }
        }
    }
    // Emit a `case "POS":` stub for every POS tag seen in the corpus
    var sb = new StringBuilder();
    foreach (var pos in posSet)
    {
        sb.Append("case \"" + pos + "\":\n");
    }
    File.WriteAllText(Config.Word_Nat_Weight_Model_Path, sb.ToString());
    return dm.SaveTxtTo(modelSavePath);
}
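// Usage sketch (illustrative, not from the source): both paths are placeholders,
// and MakeModel is assumed to be reachable from the calling code.
bool ok = MakeModel(
    "data/dependency/train.conll",            // CoNLL-format training corpus
    "data/model/dependency/WordNature.txt");  // output path for the word/POS dependency dictionary
if (!ok)
{
    Console.WriteLine("Failed to build the word-nature dependency model.");
}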
public CoNLLWord(int id, string lemma, string cpostag, string postag)
{
    ID = id;
    LEMMA = lemma;
    CPOSTAG = cpostag; // coarse-grained part-of-speech tag
    POSTAG = postag;   // fine-grained part-of-speech tag
    NAME = CoNLLUtil.Compile(postag, lemma); // normalized lookup key derived from the POS tag and lemma
}
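// Usage sketch (illustrative, not from the source): builds one word node of a CoNLL sentence.
// The tags here are examples; the concrete value of NAME depends on what CoNLLUtil.Compile returns.
var word = new CoNLLWord(2, "喜欢", "v", "VV");
Console.WriteLine(word.NAME);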
public static void MakeModel(string corpusPath, string modelSavePath)
{
    var sentences = CoNLLUtil.LoadSentences(corpusPath);
    var lines = new List<string>(sentences.Count * 30);
    var sb = new StringBuilder();
    for (int k = 0; k < sentences.Count; k++)
    {
        var s = sentences[k];
        var edges = s.GetEdgeArr();        // dependency relations (edges) between the words of this sentence
        var word = s.GetWordArrWithRoot(); // all words of the sentence, including the virtual root node
        var size = edges.GetLength(0);
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
            {
                if (i == j)
                {
                    continue; // a word's dependency on itself is not considered
                }
                // Context features for the edge from i to j; the dependency may or may not exist,
                // and when it does not, NULL is used as the label.
                var contexts = new List<string>();
                contexts.AddRange(GenerateSingleWordContext(word, i, "i"));
                contexts.AddRange(GenerateSingleWordContext(word, j, "j"));
                contexts.AddRange(GenerateUniContext(word, i, j));
                foreach (var c in contexts)
                {
                    sb.Append(c).Append(' ');
                }
                sb.Append(edges[i, j]);
                lines.Add(sb.ToString());
                sb.Clear();
            }
        }
    }
    File.WriteAllLines(modelSavePath, lines.ToArray());
}
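// Usage sketch (illustrative, not from the source): paths are placeholders.
// Each emitted line is one training event for a maximum-entropy classifier:
// the space-separated context features of an ordered word pair (i, j),
// followed by the label of the edge i -> j (or NULL when no such edge exists).
MakeModel(
    "data/dependency/train.conll",              // CoNLL-format training corpus
    "data/model/dependency/MaxEnt-train.txt");  // one feature line per ordered word pair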