/**
 * Returns the total frequency recorded for a word.
 *
 * @param word the word to look up
 * @return the attribute's totalFrequency, or 0 when getAttribute finds nothing for the word
 */
public static int getFrequency(String word)
{
    CoreDictionary.Attribute found = getAttribute(word);
    return found == null ? 0 : found.totalFrequency;
}
/**
 * Looks up the attribute of a word, consulting the core dictionary first and
 * the custom (user) dictionary second.
 *
 * @param word the word to look up
 * @return the attribute from CoreDictionary if present, otherwise whatever
 *         CustomDictionary.get returns for the word
 */
public static CoreDictionary.Attribute getAttribute(String word)
{
    CoreDictionary.Attribute found = CoreDictionary.get(word);
    if (found == null)
    {
        // Not in the core dictionary: fall back to the custom dictionary.
        found = CustomDictionary.get(word);
    }
    return found;
}
/**
 * Sets the attribute of a word from a list of natures, giving every nature a
 * frequency of 1.
 *
 * @param word    the word whose attribute is set
 * @param natures the natures to assign
 * @return false when natures is null; otherwise the result of
 *         setAttribute(word, attribute)
 */
public static bool setAttribute(String word, params Nature[] natures)
{
    if (natures == null)
    {
        return false;
    }

    CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(natures, new int[natures.Length]);

    // Java's Arrays.fill has no counterpart in the .NET base class library, so
    // the frequencies are set with a plain loop. As before, the array is
    // filled through attribute.frequency after construction.
    for (int i = 0; i < attribute.frequency.Length; ++i)
    {
        attribute.frequency[i] = 1;
    }

    return setAttribute(word, attribute);
}
/** * 从磁盘加载双数组 * * @param path * @return */ static bool loadDat(String path) { try { ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT); if (byteArray == null) { return(false); } int size = byteArray.nextInt(); CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size]; final Nature[] natureIndexArray = Nature.values();
/**
 * Creates a vertex from all of its parts.
 *
 * @param word      the compiled (equivalent) word; when null it is derived
 *                  from realWord and the attribute via compileRealWord
 * @param realWord  the real word
 * @param attribute the word's attribute; when null a single-nature
 *                  attribute (Nature.n, frequency 1) is used instead
 * @param wordID    the word id
 */
public Vertex(String word, String realWord, CoreDictionary.Attribute attribute, int wordID)
{
    // Safety net: never keep a null attribute.
    CoreDictionary.Attribute effective = attribute;
    if (effective == null)
    {
        effective = new CoreDictionary.Attribute(Nature.n, 1);
    }

    // These two assignments must precede compileRealWord, which may
    // overwrite wordID and this.attribute.
    this.wordID = wordID;
    this.attribute = effective;

    // Original author's note: realWord is expected to be non-empty, because
    // constructing a blank node would lead to an infinite loop.
    this.word = (word != null) ? word : compileRealWord(realWord, effective);
    this.realWord = realWord;
}
/**
 * Locks this vertex's attribute to a single nature.
 *
 * If the attribute already consists of exactly that nature, nothing changes.
 * Otherwise the attribute is replaced by a new single-nature attribute that
 * carries the frequency reported by getNatureFrequency, or 1000 when that
 * frequency is 0.
 *
 * @param nature the nature to lock to
 * @return true when the attribute already was that nature or reported a
 *         non-zero frequency for it; false when the frequency was 0
 */
public bool confirmNature(Nature nature)
{
    Nature[] current = attribute.nature;
    if (current.Length == 1 && current[0] == nature)
    {
        return true;
    }

    int frequency = attribute.getNatureFrequency(nature);
    bool known = frequency != 0;
    if (!known)
    {
        // Unknown nature for this word: fall back to a fixed frequency.
        frequency = 1000;
    }

    attribute = new CoreDictionary.Attribute(nature, frequency);
    return known;
}
/**
 * Sets the attribute of a word.
 *
 * The stores are tried in order: CoreDictionary.trie.set, then
 * CustomDictionary.dat.set. If neither reports success, the entry is put
 * into CustomDictionary.trie.
 *
 * @param word      the word whose attribute is set
 * @param attribute the attribute to store
 * @return false when attribute is null, true otherwise
 */
public static bool setAttribute(String word, CoreDictionary.Attribute attribute)
{
    if (attribute == null)
    {
        return false;
    }

    // Short-circuit keeps the original order: the custom DAT is only tried
    // when the core trie did not accept the update.
    bool updatedInPlace = CoreDictionary.trie.set(word, attribute)
                          || CustomDictionary.dat.set(word, attribute);
    if (!updatedInPlace)
    {
        // NOTE(review): combineByCustomDictionary null-checks
        // CustomDictionary.trie but this method does not — confirm the trie
        // is always initialised before this point.
        CustomDictionary.trie.put(word, attribute);
    }
    return true;
}
/**
 * Loads dictionary entries from a reader into the given storage.
 *
 * Each non-blank line is read as whitespace-separated fields: the word,
 * followed by (nature, frequency) pairs. Blank lines are skipped. An entry
 * whose word is already in storage is overwritten.
 *
 * The reader is not closed here; the caller keeps ownership of it.
 *
 * @param br      the source reader
 * @param storage the dictionary that receives the parsed entries
 * @throws IOException when reading from br fails
 */
public static void loadDictionary(StreamReader br, Dictionary<String, CoreDictionary.Attribute> storage)
{
    String line;
    while ((line = br.ReadLine()) != null)
    {
        // A null separator array makes Split break on white-space characters.
        // The previous separator "\\s" was matched literally (backslash + 's'),
        // not as a regular expression.
        String[] param = line.Split((char[]) null, StringSplitOptions.RemoveEmptyEntries);
        if (param.Length == 0)
        {
            continue;
        }

        int natureCount = (param.Length - 1) / 2;

        // Assumes Attribute(int) allocates nature/frequency arrays of that
        // size — TODO confirm against CoreDictionary.Attribute.
        CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(natureCount);
        for (int i = 0; i < natureCount; ++i)
        {
            attribute.nature[i] = (Nature) Enum.Parse(typeof(Nature), param[1 + 2 * i]);
            attribute.frequency[i] = int.Parse(param[2 + 2 * i]);
            attribute.totalFrequency += attribute.frequency[i];
        }

        storage[param[0]] = attribute;
    }
}
/**
 * Creates a vertex from a compiled word, a real word and an attribute,
 * delegating to the four-argument constructor with a word id of -1.
 *
 * @param word      the compiled word
 * @param realWord  the real word
 * @param attribute the attribute
 */
public Vertex(String word, String realWord, CoreDictionary.Attribute attribute)
    : this(word, realWord, attribute, wordID: -1)
{
}
/**
 * Creates a vertex whose real word is a single character, delegating to the
 * (String realWord, attribute) constructor.
 *
 * @param realWord  the real word as a single character
 * @param attribute the attribute
 */
public Vertex(char realWord, CoreDictionary.Attribute attribute)
    : this(char.ToString(realWord), attribute)
{
}
/**
 * Creates a vertex from a real word, an attribute and a word id. The
 * compiled word is passed as null to the four-argument constructor.
 *
 * @param realWord  the real word
 * @param attribute the attribute
 * @param wordID    the word id
 */
public Vertex(String realWord, CoreDictionary.Attribute attribute, int wordID)
    : this(word: null, realWord: realWord, attribute: attribute, wordID: wordID)
{
}
/**
 * Creates a vertex from a real word and an attribute only. The compiled
 * word is passed as null to the (word, realWord, attribute) constructor.
 *
 * @param realWord  the real word
 * @param attribute the attribute
 */
public Vertex(String realWord, CoreDictionary.Attribute attribute)
    : this(word: null, realWord: realWord, attribute: attribute)
{
}
/**
 * Converts a real word into its equivalent (compiled) word.
 *
 * Only attributes with exactly one nature are compiled. For the natures
 * handled below, wordID is set to the matching CoreDictionary id and the
 * corresponding Predefine tag is returned. For most of them this.attribute
 * is also replaced by the CoreDictionary entry for that id. Every other
 * case returns realWord unchanged.
 *
 * @param realWord  the original word
 * @param attribute the attribute whose nature decides the compiled form
 * @return the Predefine tag for a handled nature, otherwise realWord
 */
private String compileRealWord(String realWord, CoreDictionary.Attribute attribute)
{
    if (attribute.nature.Length != 1)
    {
        return realWord;
    }

    Nature only = attribute.nature[0];
    switch (only)
    {
        case Nature.nr:
        case Nature.nr1:
        case Nature.nr2:
        case Nature.nrf:
        case Nature.nrj:
            // Only the id changes; replacing this.attribute is disabled here.
            wordID = CoreDictionary.NR_WORD_ID;
            return Predefine.TAG_PEOPLE;

        case Nature.ns:
        case Nature.nsf:
            // During place-name recognition, words such as "河镇" should keep
            // their own nature rather than that of the place tag, so
            // this.attribute is left untouched.
            wordID = CoreDictionary.NS_WORD_ID;
            return Predefine.TAG_PLACE;

        // Nature.nz is deliberately not handled here.
        case Nature.nx:
            wordID = CoreDictionary.NX_WORD_ID;
            this.attribute = CoreDictionary.get(CoreDictionary.NX_WORD_ID);
            return Predefine.TAG_PROPER;

        case Nature.nt:
        case Nature.ntc:
        case Nature.ntcf:
        case Nature.ntcb:
        case Nature.ntch:
        case Nature.nto:
        case Nature.ntu:
        case Nature.nts:
        case Nature.nth:
        case Nature.nit:
            wordID = CoreDictionary.NT_WORD_ID;
            this.attribute = CoreDictionary.get(CoreDictionary.NT_WORD_ID);
            return Predefine.TAG_GROUP;

        case Nature.m:
        case Nature.mq:
            wordID = CoreDictionary.M_WORD_ID;
            this.attribute = CoreDictionary.get(CoreDictionary.M_WORD_ID);
            return Predefine.TAG_NUMBER;

        case Nature.x:
            wordID = CoreDictionary.X_WORD_ID;
            this.attribute = CoreDictionary.get(CoreDictionary.X_WORD_ID);
            return Predefine.TAG_CLUSTER;

        // Natures xx and w (Predefine.TAG_OTHER) are deliberately not handled.
        case Nature.t:
            wordID = CoreDictionary.T_WORD_ID;
            this.attribute = CoreDictionary.get(CoreDictionary.T_WORD_ID);
            return Predefine.TAG_TIME;

        default:
            return realWord;
    }
}
/**
 * Sets the attribute of a word from a textual nature/frequency description.
 *
 * The string is turned into an attribute by CoreDictionary.Attribute.create
 * and then handed to setAttribute(word, attribute).
 *
 * @param word                the word whose attribute is set
 * @param natureWithFrequency the description passed to Attribute.create
 * @return the result of setAttribute(word, attribute)
 */
public static bool setAttribute(String word, String natureWithFrequency)
{
    return setAttribute(word, CoreDictionary.Attribute.create(natureWithFrequency));
}
/**
 * Merges a coarse segmentation result using the custom (user) dictionary.
 *
 * Consecutive vertices whose real words form an entry of
 * CustomDictionary.trie are replaced by one vertex that carries the entry's
 * attribute; the longest match starting at each position wins. The list is
 * rewritten in place and also returned.
 *
 * Merging through CustomDictionary.dat is not performed. The earlier DAT
 * pass only computed values that were never used, so it has been removed.
 *
 * @param vertexList the coarse segmentation result; modified in place
 * @return vertexList, holding the merged result
 */
protected static LinkedList<Vertex> combineByCustomDictionary(LinkedList<Vertex> vertexList)
{
    Vertex[] wordNet = vertexList.ToArray();

    if (CustomDictionary.trie != null)
    {
        for (int i = 0; i < wordNet.Length; ++i)
        {
            if (wordNet[i] == null)
            {
                continue;
            }

            BaseNode<CoreDictionary.Attribute> state =
                CustomDictionary.trie.transition(wordNet[i].realWord.ToCharArray(), 0);
            if (state == null)
            {
                continue;
            }

            // Extend the match as far as the trie allows, remembering the
            // last position that produced a value.
            int start = i;
            int end = i + 1;
            CoreDictionary.Attribute value = state.getValue();
            for (int to = i + 1; to < wordNet.Length; ++to)
            {
                if (wordNet[to] == null)
                {
                    continue;
                }
                state = state.transition(wordNet[to].realWord.ToCharArray(), 0);
                if (state == null)
                {
                    break;
                }
                CoreDictionary.Attribute candidate = state.getValue();
                if (candidate != null)
                {
                    value = candidate;
                    end = to + 1;
                }
            }

            if (value == null)
            {
                continue;
            }

            StringBuilder sbTerm = new StringBuilder();
            for (int j = start; j < end; ++j)
            {
                if (wordNet[j] == null)
                {
                    continue;
                }
                // Append the real word explicitly: it is the text that was
                // matched against the trie, and this avoids depending on
                // what Vertex.ToString() returns.
                sbTerm.Append(wordNet[j].realWord);
                wordNet[j] = null;
            }
            wordNet[i] = new Vertex(sbTerm.ToString(), value);

            // Resume after the merged span (the loop increment adds 1).
            i = end - 1;
        }
    }

    vertexList.Clear();
    foreach (Vertex vertex in wordNet)
    {
        if (vertex != null)
        {
            vertexList.AddLast(vertex);
        }
    }

    return vertexList;
}