/// <summary>
/// Segments <paramref name="text"/> with <c>Preprocessor.Cut</c> and writes the result to a file,
/// one character per line. Each line has three tab-separated columns:
/// the character, the Flag of the token it belongs to (described by the original author as the
/// part-of-speech feature), and a word-boundary tag.
/// </summary>
/// <remarks>
/// Boundary tags: "W" for a single-character word, "B" for the first character of a longer word,
/// "M" for interior characters and "E" for the last character.
/// Tokens whose Word is null or empty are skipped instead of raising an index error.
/// </remarks>
/// <param name="text">The string to segment and write.</param>
/// <param name="path">Path of the file to create (an existing file is overwritten).</param>
public static void WriteSentenceToFile3(string text, string path)
{
    // "using" guarantees the file handle is released even if segmentation or a write throws.
    using (var sw = File.CreateText(path))
    {
        var words = Preprocessor.Cut(text);
        foreach (var item in words)
        {
            var word = item.Word;
            if (word == null || word.Length == 0)
            {
                // Nothing to emit for an empty token.
                continue;
            }

            int last = word.Length - 1;
            for (int i = 0; i <= last; i++)
            {
                string boundary;
                if (last == 0)
                {
                    boundary = "W";
                }
                else if (i == 0)
                {
                    boundary = "B";
                }
                else if (i == last)
                {
                    boundary = "E";
                }
                else
                {
                    boundary = "M";
                }

                // One character per line: char, flag, boundary tag.
                sw.WriteLine(word[i] + "\t" + item.Flag + "\t" + boundary);
            }
        }
    }
}
/// <summary>
/// Converts one or more named-entity annotated "anns" files into a single BIO-tagged file.
/// Each output line describes one character with four tab-separated columns:
/// the character, the Flag of its token, a word-boundary tag (W/B/M/E) and the entity tag.
/// </summary>
/// <remarks>
/// Every non-empty input line is split on whitespace: the first column is the text, which is
/// segmented with <c>Preprocessor.Cut</c>, and the second column is the annotation label.
/// An empty input line is copied to the output as an empty line.
/// For a recognised label "S-X" the first character written gets "B-X" and all following
/// characters get "I-X"; any other label produces "O" for every character.
/// A line without a second column is treated as a non-entity ("O") instead of raising an
/// index error, tokens with a null or empty Word are skipped, and null entries in
/// <paramref name="annsFiles"/> are ignored.
/// </remarks>
/// <param name="bioFile">Path of the output file (created or overwritten, UTF-8).</param>
/// <param name="annsFiles">The annotation files to convert; nothing is done when this is null.</param>
public static void ConvertAnnsToBio3(string bioFile, params string[] annsFiles)
{
    if (annsFiles == null)
    {
        return;
    }

    // "using" releases all three handles even when reading, segmenting or writing throws.
    using (var fs = new FileStream(bioFile, FileMode.Create))
    using (var sw = new StreamWriter(fs, Encoding.UTF8))
    {
        foreach (string annsFile in annsFiles)
        {
            if (annsFile == null)
            {
                continue;
            }

            using (StreamReader sr = File.OpenText(annsFile))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    if (string.IsNullOrEmpty(line))
                    {
                        // Preserve empty separator lines from the input.
                        sw.WriteLine();
                        continue;
                    }

                    string[] columns = line.Split();
                    List<MicroBlogCalendar.Model.Pair> tokens = Preprocessor.Cut(columns[0]);

                    // A missing label column is handled like an unknown label: non-entity.
                    string entityName = columns.Length > 1 ? GetBio3EntityName(columns[1]) : null;
                    WriteBio3Tokens(sw, tokens, entityName);
                }
            }
        }
    }
}

/// <summary>
/// Maps an annotation label to the entity name used in the B-/I- tags,
/// or returns null when the label does not denote a known entity.
/// </summary>
private static string GetBio3EntityName(string label)
{
    switch (label)
    {
        case "S-Name":          // proper noun
            return "Name";
        case "S-Person":        // person name
            return "Person";
        case "S-Location":      // place name
            return "Location";
        case "S-Organization":  // organization name
            return "Organization";
        case "S-Event":         // event
            return "Event";
        case "S-Count":
            return "Count";
        case "S-Time":          // date / time
            return "Time";
        default:                // non-entity
            return null;
    }
}

/// <summary>
/// Writes every character of every token as one line: character, token Flag, word-boundary tag
/// and entity tag. When <paramref name="entityName"/> is null all characters are tagged "O".
/// </summary>
private static void WriteBio3Tokens(TextWriter writer, List<MicroBlogCalendar.Model.Pair> tokens, string entityName)
{
    if (tokens == null)
    {
        return;
    }

    // Only the very first character emitted for an entity carries the "B-" prefix.
    bool atEntityStart = true;

    foreach (var token in tokens)
    {
        var word = token.Word;
        if (word == null || word.Length == 0)
        {
            continue;
        }

        int last = word.Length - 1;
        for (int i = 0; i <= last; i++)
        {
            string boundary;
            if (last == 0)
            {
                boundary = "W"; // single-character word
            }
            else if (i == 0)
            {
                boundary = "B";
            }
            else if (i == last)
            {
                boundary = "E";
            }
            else
            {
                boundary = "M";
            }

            string entityTag;
            if (entityName == null)
            {
                entityTag = "O";
            }
            else if (atEntityStart)
            {
                entityTag = "B-" + entityName;
            }
            else
            {
                entityTag = "I-" + entityName;
            }

            atEntityStart = false;
            writer.WriteLine(word[i] + "\t" + token.Flag + "\t" + boundary + "\t" + entityTag);
        }
    }
}