/// <summary>
/// Adds every usable word in <paramref name="buffers"/> to <paramref name="dic"/>, keyed by the
/// word's first character. The value of <c>tail(line)</c> is handed to the node's <c>AddWordTail</c>.
/// </summary>
/// <param name="buffers">Word lines; null entries and entries shorter than two characters are skipped. May be null.</param>
/// <param name="dic">Map from first character to its node; missing nodes are created on demand.</param>
/// <returns>The number of lines that were added (0 when <paramref name="buffers"/> is null).</returns>
static int WordsLoading(string[] buffers, Dictionary<char, CharNode> dic)
{
    // Nothing to load: avoid a NullReferenceException in the foreach below.
    if (buffers == null)
    {
        return 0;
    }

    int count = 0;
    foreach (string line in buffers)
    {
        if (line == null || line.Length < 2)
        {
            continue;
        }

        char key = line[0];
        CharNode cn;
        // One hash lookup instead of ContainsKey + indexer. A key that is present but mapped
        // to null is overwritten through the indexer, since Dictionary.Add would throw on it.
        if (!dic.TryGetValue(key, out cn) || cn == null)
        {
            cn = new CharNode();
            dic[key] = cn;
        }

        ++count;
        cn.AddWordTail(tail(line));
    }

    return count;
}
/// <summary>
/// Forwards to <c>node.MaxMatch</c>, starting one position after <paramref name="offset"/>.
/// </summary>
/// <param name="node">Node to match against; may be null.</param>
/// <param name="sen">Characters handed through to the node.</param>
/// <param name="offset">Index into <paramref name="sen"/>; the node receives <c>offset + 1</c>.</param>
/// <returns>The node's result, or 0 when <paramref name="node"/> is null.</returns>
public int maxMatch(CharNode node, char[] sen, int offset)
{
    if (node == null)
    {
        return 0;
    }

    return node.MaxMatch(sen, offset + 1);
}
/// <summary>
/// Looks up the head node for <c>chs[offset]</c>, stores it in <c>cns[cnIdx]</c>, and passes
/// <c>tailLens[tailLensIdx]</c> to <c>dic.maxMatch</c> together with that node.
/// </summary>
/// <param name="cns">Receives the head node (or null) at <paramref name="cnIdx"/>.</param>
/// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
/// <param name="chs">Text being matched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="chs"/>.</param>
/// <param name="tailLens">Lists handed to <c>dic.maxMatch</c>.</param>
/// <param name="tailLensIdx">Which list in <paramref name="tailLens"/> to use.</param>
protected void MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, List<int>[] tailLens, int tailLensIdx)
{
    // When the offset is at or past the end of the text, no head node is looked up.
    CharNode head = offset < chs.Length ? dic.head(chs[offset]) : null;

    cns[cnIdx] = head;
    dic.maxMatch(head, tailLens[tailLensIdx], chs, offset);
}
/// <summary>
/// Maximum match starting at <c>chs[offset]</c>; the head node found for <c>chs[offset]</c>
/// is also saved in <c>cns[cnIdx]</c>.
/// </summary>
/// <param name="cns">Receives the head node (or null) at <paramref name="cnIdx"/>.</param>
/// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
/// <param name="chs">Text being matched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="chs"/>.</param>
/// <returns>The longest matched word-tail length; a value greater than 0 means a word was found.</returns>
protected int MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset)
{
    // When the offset is at or past the end of the text, no head node is looked up.
    CharNode head = offset < chs.Length ? dic.head(chs[offset]) : null;

    cns[cnIdx] = head;
    return dic.maxMatch(head, chs, offset);
}
/// <summary>
/// Searches through <c>dic.search</c> without copying the array.
/// </summary>
/// <param name="cn">Head node to search under; may be null.</param>
/// <param name="chs">Text being searched.</param>
/// <param name="offset">Index passed through to <c>dic.search</c>.</param>
/// <param name="tailLen">Tail length to look for; 0 short-circuits to -1.</param>
/// <returns>The result of <c>dic.search</c>, or -1 when <paramref name="tailLen"/> is 0 or <paramref name="cn"/> is null.</returns>
protected int Search(CharNode cn, char[] chs, int offset, int tailLen)
{
    bool canSearch = tailLen != 0 && cn != null;
    return canSearch ? dic.search(cn, chs, offset, tailLen) : -1;
}
/// <summary>
/// Checks whether the word made of <c>sen[offset]</c> followed by a tail of length
/// <paramref name="tailLen"/> exists, by asking <c>node.IndexOf</c>.
/// </summary>
/// <param name="node">Node to query; may be null.</param>
/// <param name="sen">Text being searched.</param>
/// <param name="offset">Index passed through to <c>IndexOf</c>.</param>
/// <param name="tailLen">Tail length passed through to <c>IndexOf</c>.</param>
/// <returns>The value returned by <c>node.IndexOf</c>, or -1 when <paramref name="node"/> is null.</returns>
public int search(CharNode node, char[] sen, int offset, int tailLen)
{
    if (node == null)
    {
        return -1;
    }

    return node.IndexOf(sen, offset, tailLen);
}
/// <summary>
/// Resets <paramref name="tailLens"/> to the single entry 0 and, when a node is given, lets
/// <c>node.MaxMatch</c> continue from the position after <paramref name="offset"/>.
/// </summary>
/// <param name="node">Node to match against; may be null.</param>
/// <param name="tailLens">List that is cleared and seeded with 0 before matching.</param>
/// <param name="sen">Characters handed through to the node.</param>
/// <param name="offset">Index into <paramref name="sen"/>; the node receives <c>offset + 1</c>.</param>
/// <returns>The node's result, or <paramref name="tailLens"/> itself when <paramref name="node"/> is null.</returns>
public List<int> maxMatch(CharNode node, List<int> tailLens, char[] sen, int offset)
{
    tailLens.Clear();
    tailLens.Add(0);

    return node == null
        ? tailLens
        : node.MaxMatch(tailLens, sen, offset + 1);
}
/// <summary>
/// Checks whether the <paramref name="tailLen"/> characters following <c>chs[offset]</c> form a word.
/// </summary>
/// <param name="chs">Text being searched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="chs"/>.</param>
/// <param name="tailLen">Tail length to look for; 0 short-circuits to -1.</param>
/// <returns>The word-tail index under the head node of <c>chs[offset]</c>, or -1 when not found.</returns>
protected int Search(char[] chs, int offset, int tailLen)
{
    if (tailLen != 0)
    {
        CharNode head = dic.head(chs[offset]);
        return Search(head, chs, offset, tailLen);
    }

    return -1;
}
/// <summary>
/// Looks up the node for <c>sen[offset]</c> in <c>dict</c> and forwards to the node-based
/// <c>maxMatch</c> overload.
/// </summary>
/// <param name="sen">Text being matched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="sen"/>.</param>
/// <returns>The result of the node-based overload; a null node is passed when the character has no entry.</returns>
public int maxMatch(char[] sen, int offset)
{
    // Single hash lookup; TryGetValue leaves node as null when the key is absent.
    CharNode node;
    dict.TryGetValue(sen[offset], out node);

    return maxMatch(node, sen, offset);
}
/// <summary>
/// Resolves the head node for <c>chs[offset]</c>, records it in <c>cns[cnIdx]</c>, and hands
/// <c>tailLens[tailLensIdx]</c> to <c>dic.maxMatch</c> along with that node.
/// </summary>
/// <param name="cns">Receives the head node (or null) at <paramref name="cnIdx"/>.</param>
/// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
/// <param name="chs">Text being matched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="chs"/>.</param>
/// <param name="tailLens">Lists handed to <c>dic.maxMatch</c>.</param>
/// <param name="tailLensIdx">Which list in <paramref name="tailLens"/> to use.</param>
protected void MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, List<int>[] tailLens, int tailLensIdx)
{
    CharNode headNode = null;
    bool insideText = offset < chs.Length;
    if (insideText)
    {
        headNode = dic.head(chs[offset]);
    }

    cns[cnIdx] = headNode;

    List<int> target = tailLens[tailLensIdx];
    dic.maxMatch(headNode, target, chs, offset);
}
/// <summary>
/// Maximum match beginning at <c>chs[offset]</c>. The head node found for <c>chs[offset]</c>
/// is stored in <c>cns[cnIdx]</c> as a side effect.
/// </summary>
/// <param name="cns">Receives the head node (or null) at <paramref name="cnIdx"/>.</param>
/// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
/// <param name="chs">Text being matched.</param>
/// <param name="offset">Index of the word's first character in <paramref name="chs"/>.</param>
/// <returns>The longest matched word-tail length; a value greater than 0 means a word was found.</returns>
protected int MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset)
{
    CharNode headNode = null;
    bool insideText = offset < chs.Length;
    if (insideText)
    {
        headNode = dic.head(chs[offset]);
    }

    cns[cnIdx] = headNode;

    int longestTail = dic.maxMatch(headNode, chs, offset);
    return longestTail;
}
/// <summary>
/// Loads single-character entries into <paramref name="dic"/>. Empty lines and lines starting
/// with '#' are skipped. Each remaining line is split on spaces: the first character of the first
/// field is the key, and when the line has exactly two fields the second is parsed as a frequency.
/// </summary>
/// <param name="buffers">Lines to load; may be null.</param>
/// <param name="dic">Receives one new node per stored line; a later line for the same character replaces the earlier node.</param>
/// <returns>The number of lines stored (0 when <paramref name="buffers"/> is null).</returns>
static int load(string[] buffers, Dictionary<char, CharNode> dic)
{
    if (buffers == null)
    {
        return 0;
    }

    int n = 0;
    foreach (string line in buffers)
    {
        // Blank lines and '#' comment lines carry no entry; the '#' test is a plain char compare.
        if (string.IsNullOrEmpty(line) || line[0] == '#')
        {
            continue;
        }

        string[] w = line.Split(' ');

        // A line beginning with a space produces an empty first field, so there is no key character.
        if (w[0].Length == 0)
        {
            continue;
        }

        CharNode cn = new CharNode();

        // Derive the degree of freedom from the character frequency. Unparsable and non-positive
        // frequencies are ignored: Math.Log is not finite for them and the int cast would be meaningless.
        int freq;
        if (w.Length == 2 && int.TryParse(w[1], out freq) && freq > 0)
        {
            cn.Freq = (int)(Math.Log(freq) * 100);
        }

        // The indexer adds or replaces in one step.
        dic[w[0][0]] = cn;
        ++n;
    }

    return n;
}
/// <summary>
/// Builds a chunk of up to three words from the given start offsets and tail lengths.
/// </summary>
/// <param name="sen">Sentence whose <c>StartOffset</c> is passed to each word.</param>
/// <param name="chs">Text the words are taken from.</param>
/// <param name="tailLen">Tail length of each word; a word's length is its tail length plus one.</param>
/// <param name="offsets">Start index of each word in <paramref name="chs"/>.</param>
/// <param name="cns">Head node of each word; entries may be null.</param>
/// <returns>The new chunk; slots whose offset is at or past the end of the text are left unset.</returns>
Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
{
    Chunk chunk = new Chunk();

    for (int wordIdx = 0; wordIdx < 3; wordIdx++)
    {
        // No word can start at or beyond the end of the text.
        if (offsets[wordIdx] >= chs.Length)
        {
            continue;
        }

        chunk.Words[wordIdx] = new Word(chs, sen.StartOffset, offsets[wordIdx], tailLen[wordIdx] + 1);

        // Only single-character words take the frequency-derived degree of freedom.
        if (tailLen[wordIdx] != 0)
        {
            continue;
        }

        CharNode node = cns[wordIdx];
        if (node != null)
        {
            chunk.Words[wordIdx].Degree = node.Freq;
        }
    }

    return chunk;
}
/// <summary>
/// Tells whether a word consisting of <c>sen[offset]</c> plus a tail of length
/// <paramref name="tailLen"/> exists, using <c>node.IndexOf</c>.
/// </summary>
/// <param name="node">Node to query; may be null.</param>
/// <param name="sen">Text being searched.</param>
/// <param name="offset">Index passed through to <c>IndexOf</c>.</param>
/// <param name="tailLen">Tail length passed through to <c>IndexOf</c>.</param>
/// <returns>The value returned by <c>node.IndexOf</c>, or -1 when <paramref name="node"/> is null.</returns>
public int search(CharNode node, char[] sen, int offset, int tailLen)
{
    return node != null
        ? node.IndexOf(sen, offset, tailLen)
        : -1;
}
/// <summary>
/// Performs the lookup through <c>dic.search</c> without copying the array.
/// </summary>
/// <param name="cn">Head node to search under; may be null.</param>
/// <param name="chs">Text being searched.</param>
/// <param name="offset">Index passed through to <c>dic.search</c>.</param>
/// <param name="tailLen">Tail length to look for; 0 short-circuits to -1.</param>
/// <returns>The result of <c>dic.search</c>, or -1 when <paramref name="tailLen"/> is 0 or <paramref name="cn"/> is null.</returns>
protected int Search(CharNode cn, char[] chs, int offset, int tailLen)
{
    if (tailLen == 0)
    {
        return -1;
    }

    if (cn == null)
    {
        return -1;
    }

    return dic.search(cn, chs, offset, tailLen);
}
/// <summary>
/// Fills <paramref name="dic"/> with single-character entries. Empty lines and lines whose first
/// character is '#' are ignored. Every other line is split on spaces; the first character of the
/// first field becomes the key, and a second field (when there are exactly two) is read as a frequency.
/// </summary>
/// <param name="buffers">Lines to load; may be null.</param>
/// <param name="dic">Receives a fresh node per stored line; repeated characters overwrite the previous node.</param>
/// <returns>The number of lines stored (0 when <paramref name="buffers"/> is null).</returns>
static int load(string[] buffers, Dictionary<char, CharNode> dic)
{
    if (buffers == null)
    {
        return 0;
    }

    int stored = 0;
    foreach (string line in buffers)
    {
        if (string.IsNullOrEmpty(line))
        {
            continue;
        }

        // Ordinal comparison: the comment marker is a literal character, not linguistic text.
        if (line.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        string[] fields = line.Split(' ');
        string head = fields[0];

        // A leading space yields an empty first field; indexing it would throw.
        if (head.Length == 0)
        {
            continue;
        }

        CharNode cn = new CharNode();
        if (fields.Length == 2)
        {
            // Compute the degree of freedom from the character frequency. TryParse replaces the
            // former catch-all; zero and negative counts are skipped because Math.Log of them
            // is not a finite number.
            int freq;
            if (int.TryParse(fields[1], out freq) && freq > 0)
            {
                cn.Freq = (int)(Math.Log(freq) * 100);
            }
        }

        // Add-or-replace in a single indexer call.
        dic[head[0]] = cn;
        ++stored;
    }

    return stored;
}
/// <summary>
/// Asks <paramref name="node"/> for its maximum match, beginning at the position after
/// <paramref name="offset"/>.
/// </summary>
/// <param name="node">Node to match against; may be null.</param>
/// <param name="sen">Characters handed through to the node.</param>
/// <param name="offset">Index into <paramref name="sen"/>; the node receives <c>offset + 1</c>.</param>
/// <returns>The node's result, or 0 when <paramref name="node"/> is null.</returns>
public int maxMatch(CharNode node, char[] sen, int offset)
{
    return node != null
        ? node.MaxMatch(sen, offset + 1)
        : 0;
}
/// <summary>
/// Produces the next chunk of <paramref name="sen"/>. Every candidate tail length of the first and
/// second word is combined with the longest match for the third word; chunks at least as long as the
/// best seen so far are given to <c>mmr</c>, and the survivors are narrowed by <c>otherRules</c>.
/// </summary>
/// <param name="sen">Sentence being consumed; its offset is advanced by the longest chunk length found.</param>
/// <returns>The first remaining chunk, or null when the sentence is finished or no chunk remains.</returns>
public override Chunk Segment(Sentence sen)
{
    char[] chs = sen.Text;
    mmr.Reset();

    if (sen.IsFinish)
    {
        return null;
    }

    // Tail length of each of the three words.
    int[] tailLen = new int[3];
    // Candidate tail lengths allowed for the first and second word.
    List<int>[] tailLens = { new List<int>(), new List<int>() };
    CharNode[] cns = new CharNode[3];
    // Start position of each word in the sentence text.
    int[] offsets = new int[3];

    if (showChunk)
    {
        Console.WriteLine();
    }

    int maxLen = 0;
    offsets[0] = sen.Offset;

    // Iterate over the distinct candidate word lengths rather than every length from the
    // maximum down to 0; this avoids some redundant lookups.
    MaxMatch(cns, 0, chs, offsets[0], tailLens, 0);
    for (int firstIdx = tailLens[0].Count - 1; firstIdx >= 0; firstIdx--)
    {
        tailLen[0] = tailLens[0][firstIdx];

        // Start position of the second word.
        offsets[1] = offsets[0] + 1 + tailLen[0];
        MaxMatch(cns, 1, chs, offsets[1], tailLens, 1);

        for (int secondIdx = tailLens[1].Count - 1; secondIdx >= 0; secondIdx--)
        {
            tailLen[1] = tailLens[1][secondIdx];
            offsets[2] = offsets[1] + 1 + tailLen[1];

            // The third word only needs its longest match.
            tailLen[2] = MaxMatch(cns, 2, chs, offsets[2]);

            // Each word contributes its tail plus its head character.
            int sumChunkLen = (tailLen[0] + 1) + (tailLen[1] + 1) + (tailLen[2] + 1);

            bool longEnough = sumChunkLen >= maxLen;
            if (longEnough)
            {
                maxLen = sumChunkLen;
            }

            // With showChunk on, shorter chunks are also created and added so they can be printed.
            Chunk ck = null;
            if (longEnough || showChunk)
            {
                ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                mmr.AddChunk(ck);
            }

            if (showChunk)
            {
                Console.WriteLine(ck);
            }
        }
    }

    // maxLen characters have now been processed.
    sen.AddOffset(maxLen);

    List<Chunk> chunks = mmr.RemainChunks();
    foreach (Rule rule in otherRules)
    {
        if (showChunk)
        {
            Console.WriteLine("---------filter before {0} -----------", rule);
            PrintChunk(chunks);
        }

        // A single survivor (or none) needs no further filtering.
        if (chunks.Count <= 1)
        {
            break;
        }

        rule.Reset();
        rule.AddChunks(chunks);
        chunks = rule.RemainChunks();
    }

    if (showChunk)
    {
        Console.WriteLine("------------remainChunks--------");
        PrintChunk(chunks);
    }

    return chunks.Count > 0 ? chunks[0] : null;
}
/// <summary>
/// Clears <paramref name="tailLens"/>, seeds it with 0, and, if a node is supplied, delegates to
/// <c>node.MaxMatch</c> starting one position after <paramref name="offset"/>.
/// </summary>
/// <param name="node">Node to match against; may be null.</param>
/// <param name="tailLens">List that is cleared and seeded with 0 before matching.</param>
/// <param name="sen">Characters handed through to the node.</param>
/// <param name="offset">Index into <paramref name="sen"/>; the node receives <c>offset + 1</c>.</param>
/// <returns>The node's result, or <paramref name="tailLens"/> itself when <paramref name="node"/> is null.</returns>
public List<int> maxMatch(CharNode node, List<int> tailLens, char[] sen, int offset)
{
    tailLens.Clear();
    tailLens.Add(0);

    if (node == null)
    {
        return tailLens;
    }

    int next = offset + 1;
    return node.MaxMatch(tailLens, sen, next);
}
/// <summary>
/// Cuts the next chunk out of <paramref name="sen"/>. The candidate tail lengths of the first two
/// words are enumerated, the third word takes its longest match, qualifying chunks go to <c>mmr</c>,
/// and <c>otherRules</c> are applied in order until at most one chunk is left.
/// </summary>
/// <param name="sen">Sentence being consumed; its offset is advanced by the longest chunk length found.</param>
/// <returns>The first remaining chunk, or null when the sentence is finished or no chunk remains.</returns>
public override Chunk Segment(Sentence sen)
{
    char[] chs = sen.Text;
    mmr.Reset();

    if (sen.IsFinish)
    {
        return null;
    }

    // Per-word tail lengths of the chunk currently being tried.
    int[] tailLen = new int[3];
    // Allowed tail lengths for word one and word two.
    List<int>[] tailLens = new List<int>[2];
    tailLens[0] = new List<int>();
    tailLens[1] = new List<int>();
    CharNode[] cns = new CharNode[3];
    // Where each word starts in the sentence text.
    int[] offsets = new int[3];

    if (showChunk)
    {
        Console.WriteLine();
    }

    int maxLen = 0;
    offsets[0] = sen.Offset;

    // Only the distinct candidate lengths are visited, not every length from the maximum
    // down to 0, which saves some redundant lookups.
    MaxMatch(cns, 0, chs, offsets[0], tailLens, 0);
    List<int> firstCandidates = tailLens[0];
    for (int a = firstCandidates.Count - 1; a >= 0; a--)
    {
        tailLen[0] = firstCandidates[a];

        // The second word begins right after the first one.
        offsets[1] = offsets[0] + 1 + tailLen[0];
        MaxMatch(cns, 1, chs, offsets[1], tailLens, 1);

        List<int> secondCandidates = tailLens[1];
        for (int b = secondCandidates.Count - 1; b >= 0; b--)
        {
            tailLen[1] = secondCandidates[b];
            offsets[2] = offsets[1] + 1 + tailLen[1];

            // For the third word only the longest match matters.
            tailLen[2] = MaxMatch(cns, 2, chs, offsets[2]);

            int sumChunkLen = 0;
            foreach (int tail in tailLen)
            {
                sumChunkLen += tail + 1;
            }

            Chunk ck = null;
            if (sumChunkLen >= maxLen)
            {
                maxLen = sumChunkLen;
                ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                mmr.AddChunk(ck);
            }

            if (!showChunk)
            {
                continue;
            }

            // In show mode a chunk that did not qualify is still created and added so it can be printed.
            if (ck == null)
            {
                ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                mmr.AddChunk(ck);
            }

            Console.WriteLine(ck);
        }
    }

    // maxLen characters are done.
    sen.AddOffset(maxLen);

    List<Chunk> chunks = mmr.RemainChunks();
    foreach (Rule rule in otherRules)
    {
        if (showChunk)
        {
            Console.WriteLine("---------filter before {0} -----------", rule);
            PrintChunk(chunks);
        }

        if (chunks.Count <= 1)
        {
            break;
        }

        rule.Reset();
        rule.AddChunks(chunks);
        chunks = rule.RemainChunks();
    }

    if (showChunk)
    {
        Console.WriteLine("------------remainChunks--------");
        PrintChunk(chunks);
    }

    if (chunks.Count == 0)
    {
        return null;
    }

    return chunks[0];
}
/// <summary>
/// Registers each usable word of <paramref name="buffers"/> in <paramref name="dic"/> under its
/// first character, passing <c>tail(line)</c> to the node's <c>AddWordTail</c>.
/// </summary>
/// <param name="buffers">Word lines; null entries and entries shorter than two characters are skipped. May be null.</param>
/// <param name="dic">Map from first character to its node; nodes are created when absent.</param>
/// <returns>The number of lines that were added (0 when <paramref name="buffers"/> is null).</returns>
static int WordsLoading(string[] buffers, Dictionary<char, CharNode> dic)
{
    int count = 0;

    // A null array means there is nothing to load.
    if (buffers == null)
    {
        return count;
    }

    foreach (string line in buffers)
    {
        // Null entries and words of fewer than two characters are not loaded.
        if (line == null || line.Length < 2)
        {
            continue;
        }

        CharNode cn;
        bool found = dic.TryGetValue(line[0], out cn);
        if (!found || cn == null)
        {
            // The indexer adds or replaces, so a key that was mapped to null does not
            // trigger the duplicate-key exception that Dictionary.Add would raise.
            cn = new CharNode();
            dic[line[0]] = cn;
        }

        ++count;
        cn.AddWordTail(tail(line));
    }

    return count;
}
/// <summary>
/// Assembles a chunk of up to three words from the supplied start offsets and tail lengths.
/// </summary>
/// <param name="sen">Sentence whose <c>StartOffset</c> is passed to each word.</param>
/// <param name="chs">Text the words are taken from.</param>
/// <param name="tailLen">Tail length of each word; a word's length is its tail length plus one.</param>
/// <param name="offsets">Start index of each word in <paramref name="chs"/>.</param>
/// <param name="cns">Head node of each word; entries may be null.</param>
/// <returns>The new chunk; slots whose offset is at or past the end of the text are left unset.</returns>
Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
{
    Chunk result = new Chunk();

    int slot = 0;
    while (slot < 3)
    {
        bool startsInsideText = offsets[slot] < chs.Length;
        if (startsInsideText)
        {
            int wordLength = tailLen[slot] + 1;
            result.Words[slot] = new Word(chs, sen.StartOffset, offsets[slot], wordLength);

            // A single-character word takes its degree of freedom from the node's frequency value.
            if (tailLen[slot] == 0)
            {
                CharNode headNode = cns[slot];
                if (headNode != null)
                {
                    result.Words[slot].Degree = headNode.Freq;
                }
            }
        }

        slot++;
    }

    return result;
}