/// <summary>
        /// Loads words into <paramref name="dic"/>, grouping them under the node of their first character.
        /// For every accepted word the value of <c>tail(line)</c> is handed to that node's <c>AddWordTail</c>.
        /// </summary>
        /// <param name="buffers">One word per entry. A null array loads nothing; null entries and entries shorter than two characters are skipped.</param>
        /// <param name="dic">Map from first character to node; nodes are created on demand and the map is updated in place.</param>
        /// <returns>The number of words that were added.</returns>
        static int WordsLoading(string[] buffers, Dictionary <char, CharNode> dic)
        {
            int count = 0;

            if (buffers == null)
            {
                return(count);
            }
            foreach (string line in buffers)
            {
                if (line == null || line.Length < 2)
                {
                    continue;
                }
                CharNode cn;
                // Single lookup instead of ContainsKey followed by the indexer.
                if (!dic.TryGetValue(line[0], out cn) || cn == null)
                {
                    cn = new CharNode();
                    // Indexer assignment rather than Add: the key may already exist with a null node,
                    // in which case Add would throw an ArgumentException.
                    dic[line[0]] = cn;
                }
                ++count;
                cn.AddWordTail(tail(line));
            }
            return(count);
        }
 /// <summary>
 /// Asks <paramref name="node"/> for its match result, beginning with the character after <paramref name="offset"/>.
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/>; the node is queried from offset + 1.</param>
 /// <returns>Whatever <c>node.MaxMatch</c> returns, or 0 when <paramref name="node"/> is null.</returns>
 public int maxMatch(CharNode node, char[] sen, int offset)
 {
     return node == null ? 0 : node.MaxMatch(sen, offset + 1);
 }
Example #3
0
 /// <summary>
 /// Looks up the head node for chs[offset] (null when offset is at or past the end of chs),
 /// stores it in cns[cnIdx], and passes it together with tailLens[tailLensIdx] to <c>dic.maxMatch</c>.
 /// </summary>
 /// <param name="cns">Receives the looked-up node at index <paramref name="cnIdx"/>.</param>
 /// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
 /// <param name="chs">Characters being matched.</param>
 /// <param name="offset">Index of the character whose head node is looked up.</param>
 /// <param name="tailLens">Array of lists; the list at <paramref name="tailLensIdx"/> is handed to <c>dic.maxMatch</c>.</param>
 /// <param name="tailLensIdx">Index of the list to use.</param>
 protected void MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, List<int>[] tailLens, int tailLensIdx)
 {
     CharNode head = offset < chs.Length ? dic.head(chs[offset]) : null;
     cns[cnIdx] = head;
     dic.maxMatch(head, tailLens[tailLensIdx], chs, offset);
 }
Example #4
0
 /// <summary>
 /// Maximum match starting at chs[offset]; the character node for chs[offset] is also stored in cns[cnIdx].
 /// </summary>
 /// <param name="cns">Receives the looked-up node (null when offset is at or past the end of chs).</param>
 /// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
 /// <param name="chs">Characters being matched.</param>
 /// <param name="offset">Index of the first character of the match.</param>
 /// <returns>Longest matched word-tail length; a value greater than 0 means a match was found.</returns>
 protected int MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset)
 {
     CharNode head = offset < chs.Length ? dic.head(chs[offset]) : null;
     cns[cnIdx] = head;
     int longestTail = dic.maxMatch(head, chs, offset);
     return longestTail;
 }
 /// <summary>
 /// Searches without copying the array.
 /// </summary>
 /// <param name="cn">Node to search under; may be null.</param>
 /// <param name="chs">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="chs"/> passed through to <c>dic.search</c>.</param>
 /// <param name="tailLen">Tail length passed through to <c>dic.search</c>.</param>
 /// <returns>The result of <c>dic.search</c>, or -1 when <paramref name="tailLen"/> is 0 or <paramref name="cn"/> is null.</returns>
 protected int Search(CharNode cn, char[] chs, int offset, int tailLen)
 {
     bool searchable = tailLen != 0 && cn != null;
     return searchable ? dic.search(cn, chs, offset, tailLen) : -1;
 }
 /// <summary>
 /// Checks whether a word whose tail is tailLen characters long exists after sen[offset].
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/> passed through to <c>node.IndexOf</c>.</param>
 /// <param name="tailLen">Tail length passed through to <c>node.IndexOf</c>.</param>
 /// <returns>The value of <c>node.IndexOf</c>, or -1 when <paramref name="node"/> is null.</returns>
 public int search(CharNode node, char[] sen, int offset, int tailLen)
 {
     return node == null ? -1 : node.IndexOf(sen, offset, tailLen);
 }
 /// <summary>
 /// Resets <paramref name="tailLens"/> to the single entry 0 and, when a node is available,
 /// lets <c>node.MaxMatch</c> process the list starting one position after <paramref name="offset"/>.
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="tailLens">List that is cleared and reused for the result.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/>; the node is queried from offset + 1.</param>
 /// <returns>The list returned by <c>node.MaxMatch</c>, or <paramref name="tailLens"/> itself when <paramref name="node"/> is null.</returns>
 public List <int> maxMatch(CharNode node, List <int> tailLens, char[] sen, int offset)
 {
     tailLens.Clear();
     tailLens.Add(0);
     if (node == null)
     {
         return tailLens;
     }
     int next = offset + 1;
     return node.MaxMatch(tailLens, sen, next);
 }
        /// <summary>
        /// Checks whether the tailLen characters that follow chs[offset] form a word.
        /// </summary>
        /// <param name="chs">Characters being matched.</param>
        /// <param name="offset">Index of the word's first character.</param>
        /// <param name="tailLen">Number of characters after chs[offset] that make up the word tail.</param>
        /// <returns>
        /// Index of the word tail under the character node of chs[offset]; -1 when nothing is found,
        /// when <paramref name="tailLen"/> is 0, or when <paramref name="offset"/> is at or past the end of <paramref name="chs"/>.
        /// </returns>
        protected int Search(char[] chs, int offset, int tailLen)
        {
            // Guard the index so an offset at or past the end reports "not found" instead of throwing.
            if (tailLen == 0 || offset >= chs.Length)
            {
                return(-1);
            }
            CharNode cn = dic.head(chs[offset]);

            return(Search(cn, chs, offset, tailLen));
        }
        /// <summary>
        /// Looks up the node registered for sen[offset] in <c>dict</c> and delegates to the node-based overload.
        /// </summary>
        /// <param name="sen">Characters being matched.</param>
        /// <param name="offset">Index of the first character of the match.</param>
        /// <returns>The result of the node-based <c>maxMatch</c>; a null node is passed when the character has no entry.</returns>
        public int maxMatch(char[] sen, int offset)
        {
            CharNode node = dict.ContainsKey(sen[offset]) ? dict[sen[offset]] : null;

            return maxMatch(node, sen, offset);
        }
        /// <summary>
        /// Looks up the head node for chs[offset] (null when offset is at or past the end of chs),
        /// stores it in cns[cnIdx], and passes it together with tailLens[tailLensIdx] to <c>dic.maxMatch</c>.
        /// </summary>
        /// <param name="cns">Receives the looked-up node at index <paramref name="cnIdx"/>.</param>
        /// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
        /// <param name="chs">Characters being matched.</param>
        /// <param name="offset">Index of the character whose head node is looked up.</param>
        /// <param name="tailLens">Array of lists; the list at <paramref name="tailLensIdx"/> is handed to <c>dic.maxMatch</c>.</param>
        /// <param name="tailLensIdx">Index of the list to use.</param>
        protected void MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, List <int>[] tailLens, int tailLensIdx)
        {
            bool inRange = offset < chs.Length;
            CharNode head = inRange ? dic.head(chs[offset]) : null;

            cns[cnIdx] = head;
            dic.maxMatch(head, tailLens[tailLensIdx], chs, offset);
        }
        /// <summary>
        /// Maximum match starting at chs[offset]; the character node for chs[offset] is also stored in cns[cnIdx].
        /// </summary>
        /// <param name="cns">Receives the looked-up node (null when offset is at or past the end of chs).</param>
        /// <param name="cnIdx">Slot in <paramref name="cns"/> to write.</param>
        /// <param name="chs">Characters being matched.</param>
        /// <param name="offset">Index of the first character of the match.</param>
        /// <returns>Longest matched word-tail length; a value greater than 0 means a match was found.</returns>
        protected int MaxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset)
        {
            bool inRange = offset < chs.Length;
            CharNode head = inRange ? dic.head(chs[offset]) : null;

            cns[cnIdx] = head;
            return dic.maxMatch(head, chs, offset);
        }
        /// <summary>
        /// Loads single-character entries into <paramref name="dic"/>. Each line is split on a single space;
        /// the first character of the first token is the key, and when the line has exactly two tokens the
        /// second one is read as an integer frequency from which the node's <c>Freq</c> is derived.
        /// </summary>
        /// <param name="buffers">
        /// Lines to load. A null array loads nothing. Null or empty lines, lines starting with '#',
        /// and lines whose first token is empty (the line begins with a space) are skipped.
        /// </param>
        /// <param name="dic">Map from character to node; an existing entry for the same character is replaced.</param>
        /// <returns>The number of lines that were stored in <paramref name="dic"/>.</returns>
        static int load(string[] buffers, Dictionary <char, CharNode> dic)
        {
            if (buffers == null)
            {
                return(0);
            }
            int n = 0;

            foreach (string line in buffers)
            {
                if (string.IsNullOrEmpty(line) || line.StartsWith("#", StringComparison.Ordinal))
                {
                    continue;
                }
                string[] w = line.Split(' ');
                if (w[0].Length == 0)
                {
                    // A line that begins with a space yields an empty first token; w[0][0] would throw.
                    continue;
                }
                CharNode cn = new CharNode();
                int rawFreq;
                // TryParse replaces the former empty catch block: an unparsable count leaves Freq at its default.
                // Only a positive count is used, because Math.Log of 0 or a negative number is not a finite
                // value and cannot be meaningfully cast to int.
                if (w.Length == 2 && int.TryParse(w[1], out rawFreq) && rawFreq > 0)
                {
                    // Derive the degree of freedom from the character frequency.
                    cn.Freq = (int)(Math.Log(rawFreq) * 100);
                }

                // The indexer adds the key when missing and overwrites it otherwise.
                dic[w[0][0]] = cn;
                ++n;
            }
            return(n);
        }
Example #13
0
        /// <summary>
        /// Builds a chunk of up to three words. A word is created for slot i only when offsets[i] lies inside
        /// <paramref name="chs"/>; its length is tailLen[i] + 1. Single-character words (tailLen[i] == 0) take
        /// their Degree from the Freq of the matching node in <paramref name="cns"/>, when that node is not null.
        /// </summary>
        /// <param name="sen">Sentence whose StartOffset is passed to every created word.</param>
        /// <param name="chs">Characters of the sentence.</param>
        /// <param name="tailLen">Tail length of each of the three words.</param>
        /// <param name="offsets">Start index of each of the three words in <paramref name="chs"/>.</param>
        /// <param name="cns">Character node of each word's first character; entries may be null.</param>
        /// <returns>The newly created chunk.</returns>
        Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
        {
            Chunk chunk = new Chunk();

            for (int slot = 0; slot < 3; ++slot)
            {
                if (offsets[slot] >= chs.Length)
                {
                    // No characters left for this slot.
                    continue;
                }
                chunk.Words[slot] = new Word(chs, sen.StartOffset, offsets[slot], tailLen[slot] + 1);
                // A single-character word gets the degree of freedom derived from the character frequency.
                if (tailLen[slot] == 0 && cns[slot] != null)
                {
                    chunk.Words[slot].Degree = cns[slot].Freq;
                }
            }
            return chunk;
        }
 /// <summary>
 /// Checks whether a word whose tail is tailLen characters long exists after sen[offset].
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/> passed through to <c>node.IndexOf</c>.</param>
 /// <param name="tailLen">Tail length passed through to <c>node.IndexOf</c>.</param>
 /// <returns>The value of <c>node.IndexOf</c>, or -1 when <paramref name="node"/> is null.</returns>
 public int search(CharNode node, char[] sen, int offset, int tailLen)
 {
     if (node == null)
     {
         return -1;
     }
     int tailIndex = node.IndexOf(sen, offset, tailLen);
     return tailIndex;
 }
Example #15
0
 /// <summary>
 /// Searches without copying the array.
 /// </summary>
 /// <param name="cn">Node to search under; may be null.</param>
 /// <param name="chs">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="chs"/> passed through to <c>dic.search</c>.</param>
 /// <param name="tailLen">Tail length passed through to <c>dic.search</c>.</param>
 /// <returns>The result of <c>dic.search</c>, or -1 when <paramref name="tailLen"/> is 0 or <paramref name="cn"/> is null.</returns>
 protected int Search(CharNode cn, char[] chs, int offset, int tailLen)
 {
     if (tailLen == 0)
     {
         return -1;
     }
     if (cn == null)
     {
         return -1;
     }
     return dic.search(cn, chs, offset, tailLen);
 }
        /// <summary>
        /// Loads single-character entries into <paramref name="dic"/>. Each line is split on a single space;
        /// the first character of the first token is the key, and when the line has exactly two tokens the
        /// second one is read as an integer frequency from which the node's <c>Freq</c> is derived.
        /// </summary>
        /// <param name="buffers">
        /// Lines to load. A null array loads nothing. Null or empty lines, lines starting with '#',
        /// and lines whose first token is empty (the line begins with a space) are skipped.
        /// </param>
        /// <param name="dic">Map from character to node; an existing entry for the same character is replaced.</param>
        /// <returns>The number of lines that were stored in <paramref name="dic"/>.</returns>
        static int load(string[] buffers, Dictionary<char, CharNode> dic)
        {
            if (buffers == null) return 0;
            int n = 0;
            foreach (string line in buffers)
            {
                if (string.IsNullOrEmpty(line)) continue;
                if (line.StartsWith("#", StringComparison.Ordinal)) continue;
                string[] w = line.Split(' ');
                // A line that begins with a space yields an empty first token; w[0][0] would throw.
                if (w[0].Length == 0) continue;
                CharNode cn = new CharNode();
                int rawFreq;
                // TryParse replaces the former empty catch block: an unparsable count leaves Freq at its default.
                // Only a positive count is used, because Math.Log of 0 or a negative number is not a finite
                // value and cannot be meaningfully cast to int.
                if (w.Length == 2 && int.TryParse(w[1], out rawFreq) && rawFreq > 0)
                {
                    // Derive the degree of freedom from the character frequency.
                    cn.Freq = (int)(Math.Log(rawFreq) * 100);
                }

                // The indexer adds the key when missing and overwrites it otherwise.
                dic[w[0][0]] = cn;
                ++n;
            }
            return n;
        }
 /// <summary>
 /// Asks <paramref name="node"/> for its match result, beginning with the character after <paramref name="offset"/>.
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/>; the node is queried from offset + 1.</param>
 /// <returns>Whatever <c>node.MaxMatch</c> returns, or 0 when <paramref name="node"/> is null.</returns>
 public int maxMatch(CharNode node, char[] sen, int offset)
 {
     if (node == null)
     {
         return 0;
     }
     int next = offset + 1;
     return node.MaxMatch(sen, next);
 }
        /// <summary>
        /// Enumerates candidate chunks of up to three words starting at the sentence's current offset,
        /// advances the sentence by the largest summed chunk length seen, then lets the remaining rules
        /// filter the candidates and returns the first chunk that is left.
        /// </summary>
        /// <param name="sen">Sentence being segmented; its offset is advanced through <c>AddOffset</c>.</param>
        /// <returns>The first remaining chunk, or null when the sentence is already finished or no chunk remains.</returns>
        public override Chunk Segment(Sentence sen)
        {
            char[] chs = sen.Text;
            int[] tailLen = new int[3];// tail length of each of the three words
            List<int>[] tailLens = new List<int>[2];// candidate tail lengths for the first and the second word
            for (int i = 0; i < 2; i++)
            {
                tailLens[i] = new List<int>();
            }
            CharNode[] cns = new CharNode[3];

            // start position of each word in the sentence
            int[] offsets = new int[3];
            mmr.Reset();
            if (!sen.IsFinish)
            {
                if (showChunk)
                {
                    Console.WriteLine();
                }
                int maxLen = 0;
                offsets[0] = sen.Offset;
                //Console.WriteLine("{0}:{1}", sen.Offset, new String(sen.Text));
                /*
                 * Iterate over the distinct candidate word lengths instead of counting down from the
                 * maximum to 0 (w[0]=maxLen(chs,offsets[0]);w[0]>=0;w[0]--);
                 * this avoids some redundant lookups.
                 */
                MaxMatch(cns, 0, chs, offsets[0], tailLens, 0);
                // Candidates are visited from the last list entry to the first.
                for (int aIdx = tailLens[0].Count - 1; aIdx >= 0; aIdx--)
                {
                    tailLen[0] = tailLens[0][aIdx];
                    // start position of the second word
                    offsets[1] = offsets[0] + 1 + tailLen[0];
                    MaxMatch(cns, 1, chs, offsets[1], tailLens, 1);
                    for (int bIdx = tailLens[1].Count - 1; bIdx >= 0; bIdx--)
                    {
                        tailLen[1] = tailLens[1][bIdx];
                        offsets[2] = offsets[1] + 1 + tailLen[1];

                        // the third word only needs its longest match
                        tailLen[2] = MaxMatch(cns, 2, chs, offsets[2]);
                        // NOTE(review): every slot contributes tailLen[i] + 1 here, even when offsets[i] is
                        // at or past the end of chs and CreateChunk creates no word for it — confirm that
                        // AddOffset tolerates the resulting length near the end of the sentence.
                        int sumChunkLen = 0;
                        for (int i = 0; i < 3; i++)
                        {
                            sumChunkLen += tailLen[i] + 1;
                        }
                        Chunk ck = null;
                        // Keep every candidate that ties or beats the best length seen so far.
                        if (sumChunkLen >= maxLen)
                        {
                            maxLen = sumChunkLen;
                            ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                            mmr.AddChunk(ck);
                        }
                        if (showChunk)
                        {
                            // NOTE(review): with showChunk set, candidates shorter than maxLen are also
                            // added to mmr, not just printed — confirm this is intended.
                            if (ck == null)
                            {
                                ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                                mmr.AddChunk(ck);
                            }
                            Console.WriteLine(ck);
                        }
                    }
                }
                // maxLen characters have now been processed
                sen.AddOffset(maxLen);
                //Console.WriteLine("max:{0}", maxLen);
                List<Chunk> chunks = mmr.RemainChunks();
                // Apply the other rules in order until at most one chunk is left.
                foreach (Rule rule in otherRules)
                {
                    if (showChunk)
                    {
                        Console.WriteLine("---------filter before {0} -----------", rule);
                        PrintChunk(chunks);
                    }
                    if (chunks.Count <= 1)
                        break;

                    rule.Reset();
                    rule.AddChunks(chunks);
                    chunks = rule.RemainChunks();
                }
                if (showChunk)
                {
                    Console.WriteLine("------------remainChunks--------");
                    PrintChunk(chunks);
                }
                if (chunks.Count > 0)
                    return chunks[0];
            }

            return null;
        }
 /// <summary>
 /// Resets <paramref name="tailLens"/> to the single entry 0 and, when a node is available,
 /// lets <c>node.MaxMatch</c> process the list starting one position after <paramref name="offset"/>.
 /// </summary>
 /// <param name="node">Node to query; may be null.</param>
 /// <param name="tailLens">List that is cleared and reused for the result.</param>
 /// <param name="sen">Characters being matched.</param>
 /// <param name="offset">Index into <paramref name="sen"/>; the node is queried from offset + 1.</param>
 /// <returns>The list returned by <c>node.MaxMatch</c>, or <paramref name="tailLens"/> itself when <paramref name="node"/> is null.</returns>
 public List<int> maxMatch(CharNode node, List<int> tailLens, char[] sen, int offset)
 {
     tailLens.Clear();
     tailLens.Add(0);
     if (node == null)
     {
         return tailLens;
     }
     else
     {
         return node.MaxMatch(tailLens, sen, offset + 1);
     }
 }
Example #20
0
        /// <summary>
        /// Enumerates candidate chunks of up to three words starting at the sentence's current offset,
        /// advances the sentence by the largest summed chunk length seen, then lets the remaining rules
        /// filter the candidates and returns the first chunk that is left.
        /// </summary>
        /// <param name="sen">Sentence being segmented; its offset is advanced through <c>AddOffset</c>.</param>
        /// <returns>The first remaining chunk, or null when the sentence is already finished or no chunk remains.</returns>
        public override Chunk Segment(Sentence sen)
        {
            char[]       chs      = sen.Text;
            int[]        tailLen  = new int[3];         // tail length of each of the three words
            List <int>[] tailLens = new List <int> [2]; // candidate tail lengths for the first and the second word
            for (int i = 0; i < 2; i++)
            {
                tailLens[i] = new List <int>();
            }
            CharNode[] cns = new CharNode[3];

            // start position of each word in the sentence
            int[] offsets = new int[3];
            mmr.Reset();
            if (!sen.IsFinish)
            {
                if (showChunk)
                {
                    Console.WriteLine();
                }
                int maxLen = 0;
                offsets[0] = sen.Offset;
                //Console.WriteLine("{0}:{1}", sen.Offset, new String(sen.Text));

                /*
                 * Iterate over the distinct candidate word lengths instead of counting down from the
                 * maximum to 0 (w[0]=maxLen(chs,offsets[0]);w[0]>=0;w[0]--);
                 * this avoids some redundant lookups.
                 */
                MaxMatch(cns, 0, chs, offsets[0], tailLens, 0);
                // Candidates are visited from the last list entry to the first.
                for (int aIdx = tailLens[0].Count - 1; aIdx >= 0; aIdx--)
                {
                    tailLen[0] = tailLens[0][aIdx];
                    // start position of the second word
                    offsets[1] = offsets[0] + 1 + tailLen[0];
                    MaxMatch(cns, 1, chs, offsets[1], tailLens, 1);
                    for (int bIdx = tailLens[1].Count - 1; bIdx >= 0; bIdx--)
                    {
                        tailLen[1] = tailLens[1][bIdx];
                        offsets[2] = offsets[1] + 1 + tailLen[1];

                        // the third word only needs its longest match
                        tailLen[2] = MaxMatch(cns, 2, chs, offsets[2]);
                        // NOTE(review): every slot contributes tailLen[i] + 1 here, even when offsets[i] is
                        // at or past the end of chs and CreateChunk creates no word for it — confirm that
                        // AddOffset tolerates the resulting length near the end of the sentence.
                        int sumChunkLen = 0;
                        for (int i = 0; i < 3; i++)
                        {
                            sumChunkLen += tailLen[i] + 1;
                        }
                        Chunk ck = null;
                        // Keep every candidate that ties or beats the best length seen so far.
                        if (sumChunkLen >= maxLen)
                        {
                            maxLen = sumChunkLen;
                            ck     = CreateChunk(sen, chs, tailLen, offsets, cns);
                            mmr.AddChunk(ck);
                        }
                        if (showChunk)
                        {
                            // NOTE(review): with showChunk set, candidates shorter than maxLen are also
                            // added to mmr, not just printed — confirm this is intended.
                            if (ck == null)
                            {
                                ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                                mmr.AddChunk(ck);
                            }
                            Console.WriteLine(ck);
                        }
                    }
                }
                // maxLen characters have now been processed
                sen.AddOffset(maxLen);
                //Console.WriteLine("max:{0}", maxLen);
                List <Chunk> chunks = mmr.RemainChunks();
                // Apply the other rules in order until at most one chunk is left.
                foreach (Rule rule in otherRules)
                {
                    if (showChunk)
                    {
                        Console.WriteLine("---------filter before {0} -----------", rule);
                        PrintChunk(chunks);
                    }
                    if (chunks.Count <= 1)
                    {
                        break;
                    }

                    rule.Reset();
                    rule.AddChunks(chunks);
                    chunks = rule.RemainChunks();
                }
                if (showChunk)
                {
                    Console.WriteLine("------------remainChunks--------");
                    PrintChunk(chunks);
                }
                if (chunks.Count > 0)
                {
                    return(chunks[0]);
                }
            }

            return(null);
        }
 /// <summary>
 /// Loads words into <paramref name="dic"/>, grouping them under the node of their first character.
 /// For every accepted word the value of <c>tail(line)</c> is handed to that node's <c>AddWordTail</c>.
 /// </summary>
 /// <param name="buffers">One word per entry. A null array loads nothing; null entries and entries shorter than two characters are skipped.</param>
 /// <param name="dic">Map from first character to node; nodes are created on demand and the map is updated in place.</param>
 /// <returns>The number of words that were added.</returns>
 static int WordsLoading(string[] buffers, Dictionary<char, CharNode> dic)
 {
     int count = 0;
     if (buffers == null)
     {
         return count;
     }
     foreach (string line in buffers)
     {
         if (line == null || line.Length < 2)
         {
             continue;
         }
         CharNode cn;
         // Single lookup instead of ContainsKey followed by the indexer.
         if (!dic.TryGetValue(line[0], out cn) || cn == null)
         {
             cn = new CharNode();
             // Indexer assignment rather than Add: the key may already exist with a null node,
             // in which case Add would throw an ArgumentException.
             dic[line[0]] = cn;
         }
         ++count;
         cn.AddWordTail(tail(line));
     }
     return count;
 }
 /// <summary>
 /// Builds a chunk of up to three words. A word is created for slot i only when offsets[i] lies inside
 /// <paramref name="chs"/>; its length is tailLen[i] + 1. Single-character words (tailLen[i] == 0) take
 /// their Degree from the Freq of the matching node in <paramref name="cns"/>, when that node is not null.
 /// </summary>
 /// <param name="sen">Sentence whose StartOffset is passed to every created word.</param>
 /// <param name="chs">Characters of the sentence.</param>
 /// <param name="tailLen">Tail length of each of the three words.</param>
 /// <param name="offsets">Start index of each of the three words in <paramref name="chs"/>.</param>
 /// <param name="cns">Character node of each word's first character; entries may be null.</param>
 /// <returns>The newly created chunk.</returns>
 Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
 {
     Chunk chunk = new Chunk();
     for (int slot = 0; slot < 3; ++slot)
     {
         if (offsets[slot] >= chs.Length)
         {
             // No characters left for this slot.
             continue;
         }
         chunk.Words[slot] = new Word(chs, sen.StartOffset, offsets[slot], tailLen[slot] + 1);
         // A single-character word gets the degree of freedom derived from the character frequency.
         if (tailLen[slot] == 0 && cns[slot] != null)
         {
             chunk.Words[slot].Degree = cns[slot].Freq;
         }
     }
     return chunk;
 }