Exemplo n.º 1
0
        /// <summary>
        /// Segments <paramref name="text"/> with <c>Preprocessor.Cut</c> and writes the result to a file,
        /// one character per line, in three tab-separated columns: the character, the flag of the word
        /// it belongs to (the part-of-speech feature), and a word-boundary marker.
        /// </summary>
        /// <remarks>
        /// Boundary markers: <c>W</c> for a single-character word, otherwise <c>B</c> for the first
        /// character, <c>M</c> for every inner character and <c>E</c> for the last character.
        /// Words of length zero produce no output.
        /// </remarks>
        /// <param name="text">The string to segment and write.</param>
        /// <param name="path">Path of the file to write; it is created through <c>File.CreateText</c>.</param>
        public static void WriteSentenceToFile3(string text, string path)
        {
            // The using block guarantees the writer is flushed and the file handle is released
            // even when segmentation or one of the writes throws.
            using (StreamWriter writer = File.CreateText(path))
            {
                foreach (var token in Preprocessor.Cut(text))
                {
                    int length = token.Word.Length;

                    // One output line per character of the word.
                    for (int i = 0; i < length; i++)
                    {
                        string boundary;
                        if (length == 1)
                        {
                            boundary = "W";
                        }
                        else if (i == 0)
                        {
                            boundary = "B";
                        }
                        else if (i == length - 1)
                        {
                            boundary = "E";
                        }
                        else
                        {
                            boundary = "M";
                        }

                        writer.WriteLine(token.Word[i] + "\t" + token.Flag + "\t" + boundary);
                    }
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Converts one or more named-entity-annotated "anns" files into a single BIO-tagged file
        /// with three features per character: the character itself, the flag of its word
        /// (the part-of-speech feature) and a word-boundary marker, followed by the BIO label.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Every non-empty input line is split on whitespace; the first field is the text and the
        /// second field is its tag. The text is segmented with <c>Preprocessor.Cut</c> and written
        /// one character per line as <c>char \t flag \t boundary \t label</c>.
        /// </para>
        /// <para>
        /// Recognised tags are <c>S-Name</c>, <c>S-Person</c>, <c>S-Location</c>,
        /// <c>S-Organization</c>, <c>S-Event</c>, <c>S-Count</c> and <c>S-Time</c>. For these the
        /// first character of the text is labelled <c>B-&lt;Type&gt;</c> and all following characters
        /// <c>I-&lt;Type&gt;</c>. Any other tag, or a line without a tag field, is labelled <c>O</c>.
        /// </para>
        /// <para>
        /// An empty input line is copied to the output as an empty line. The input files are
        /// processed in the order given and their output is concatenated.
        /// </para>
        /// </remarks>
        /// <param name="bioFile">Name of the output file; it is created or overwritten.</param>
        /// <param name="annsFiles">The files to convert. When null, nothing is done and no file is created.</param>
        public static void ConvertAnnsToBio3(string bioFile, params string[] annsFiles)
        {
            if (annsFiles == null)
            {
                return;
            }

            // using blocks release the output stream and every reader even when an exception
            // is thrown half-way through the conversion.
            using (var fs = new FileStream(bioFile, FileMode.Create))
            using (var sw = new StreamWriter(fs, Encoding.UTF8))
            {
                foreach (string annsFile in annsFiles)
                {
                    using (StreamReader sr = File.OpenText(annsFile))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            if (string.IsNullOrEmpty(line))
                            {
                                // Keep empty lines so the separators of the input survive in the output.
                                sw.WriteLine();
                                continue;
                            }

                            string[] fields = line.Split();

                            // A line without a tag field is handled like an unrecognised tag (non-entity)
                            // instead of failing with an IndexOutOfRangeException.
                            string tag = fields.Length > 1 ? fields[1] : string.Empty;

                            string entityType;
                            switch (tag)
                            {
                            case "S-Name":            // proper noun
                            case "S-Person":          // person name
                            case "S-Location":        // place name
                            case "S-Organization":    // organization name
                            case "S-Event":           // event
                            case "S-Count":
                            case "S-Time":            // date / time
                                // Drop the "S-" prefix; what remains is the label suffix.
                                entityType = tag.Substring(2);
                                break;

                            default:                  // not an entity
                                entityType = null;
                                break;
                            }

                            List<MicroBlogCalendar.Model.Pair> tokens = Preprocessor.Cut(fields[0]);
                            WriteBioLines3(sw, tokens, entityType);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Writes the characters of <paramref name="tokens"/> one per line as
        /// <c>char \t flag \t boundary \t label</c>.
        /// </summary>
        /// <param name="writer">Destination writer.</param>
        /// <param name="tokens">Segmented words of one annotated text span.</param>
        /// <param name="entityType">
        /// Label suffix such as <c>Person</c>; the first written character gets <c>B-&lt;suffix&gt;</c>
        /// and all later ones <c>I-&lt;suffix&gt;</c>. Null labels every character <c>O</c>.
        /// </param>
        private static void WriteBioLines3(TextWriter writer, List<MicroBlogCalendar.Model.Pair> tokens, string entityType)
        {
            string beginLabel  = entityType == null ? "O" : "B-" + entityType;
            string insideLabel = entityType == null ? "O" : "I-" + entityType;

            // True until the first character of the span has been written.
            bool atSpanStart = true;

            foreach (var token in tokens)
            {
                int length = token.Word.Length;

                // A zero-length word simply contributes no lines.
                for (int i = 0; i < length; i++)
                {
                    string boundary;
                    if (length == 1)
                    {
                        boundary = "W";    // single-character word
                    }
                    else if (i == 0)
                    {
                        boundary = "B";    // first character of the word
                    }
                    else if (i == length - 1)
                    {
                        boundary = "E";    // last character of the word
                    }
                    else
                    {
                        boundary = "M";    // inner character of the word
                    }

                    string label = atSpanStart ? beginLabel : insideLabel;
                    writer.WriteLine(token.Word[i] + "\t" + token.Flag + "\t" + boundary + "\t" + label);
                    atSpanStart = false;
                }
            }
        }