/// <summary>
/// Renders a single token as one line of output. In plain mode this is just
/// "word<TAB>tag"; otherwise it is a tab-separated CoNLL-style line.
/// </summary>
private string TokenToString(TaggedToken token, int index, bool plain)
{
    if (plain)
    {
        return $"{token.Token.Value}\t{PosTagSet.GetTagName(token.PosTag)}";
    }
    string[] pos = null;
    string neTag = null;
    string neType = null;
    if (token.PosTag >= 0)
    {
        // Tag names are stored as "POS|features"; split into at most two parts.
        pos = PosTagSet.GetTagName(token.PosTag).Split(new[] { '|' }, 2);
    }
    if (token.NeTag >= 0)
    {
        neTag = NeTagSet.GetTagName(token.NeTag);
    }
    if (token.NeTypeTag >= 0)
    {
        neType = NeTypeTagSet.GetTagName(token.NeTypeTag);
    }
    // Columns: ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, four unused columns,
    // then NE tag, NE type, and token ID. CPOSTAG and POSTAG both carry the
    // bare POS; the feature string (if any) goes into FEATS.
    return $"{index + 1}\t{token.Token.Value}\t{token.Lemma ?? ""}" +
           $"\t{(pos == null ? "_" : pos[0])}\t{(pos == null ? "_" : pos[0])}" +
           $"\t{(pos == null || pos.Length < 2 ? "_" : pos[1])}" +
           $"\t_\t_\t_\t_\t{neTag ?? "_"}\t{neType ?? "_"}\t{token.Id ?? "_"}";
}
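// A minimal usage sketch, not part of the original source: WriteSentence is a
// hypothetical helper showing how TokenToString would serialize a whole
// sentence, one line per token plus the blank line that ReadConll below
// treats as a sentence boundary. It assumes it lives in the same class, so
// the private TokenToString is in scope.
private void WriteSentence(TextWriter writer, TaggedToken[] sentence, bool plain)
{
    for (int i = 0; i < sentence.Length; i++)
    {
        writer.WriteLine(TokenToString(sentence[i], i, plain));
    }
    // Terminate the sentence with an empty line, mirroring what ReadConll expects.
    writer.WriteLine();
}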
/// <summary>
/// Reads sentences in CoNLL format (or, if plain is true, one "word[<TAB>tag]"
/// token per line). Sentences are separated by blank lines; lines starting
/// with '#' are comments. Returns null if the input contains no sentences.
/// </summary>
public TaggedToken[][] ReadConll(StreamReader reader, string fileId, bool extend, bool plain)
{
    List<TaggedToken[]> sentences = new List<TaggedToken[]>();
    List<TaggedToken> sentence = new List<TaggedToken>();
    // Pick a tokenizer for the configured language; it is only used to
    // classify each token's type, so it starts from an empty reader.
    Tokenizer tokenizer;
    if (Language.Equals("sv"))
    {
        tokenizer = new SwedishTokenizer(new StringReader(""));
    }
    else if (Language.Equals("en"))
    {
        tokenizer = new EnglishTokenizer(new StringReader(""));
    }
    else if (Language.Equals("zh"))
    {
        tokenizer = null;
    }
    else
    {
        tokenizer = new LatinTokenizer(new StringReader(""));
    }
    string line;
    int sentenceIndex = 0;
    int tokenIndex = 0;
    while ((line = reader.ReadLine()) != null)
    {
        // A blank line ends the current sentence.
        if (line.Equals(""))
        {
            if (sentence.Count > 0)
            {
                sentences.Add(sentence.ToArray());
                sentence = new List<TaggedToken>();
                sentenceIndex++;
                tokenIndex = 0;
            }
            continue;
        }
        if (line.StartsWith("#"))
        {
            continue;
        }
        string[] fields = plain ? Regex.Split(line, "\\s+") : line.Split('\t');
        string posString = null;
        string neString = null;
        string neTypeString = null;
        string tokenId = null;
        string text;
        string lemma = null;
        int fieldsLength = fields.Length;
        if (plain)
        {
            // Plain format: "word" or "word tag".
            if (fieldsLength < 1 || fieldsLength > 2)
            {
                throw new FormatException($"Expected 1 or 2 fields, found {fieldsLength} in: {line}");
            }
            text = fields[0];
            if (fieldsLength == 2)
            {
                posString = fields[1];
            }
        }
        else
        {
            // CoNLL format: at least ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS.
            if (fieldsLength < 6)
            {
                throw new FormatException($"Expected at least 6 fields, found {fieldsLength} in: {line}");
            }
            text = fields[1];
            lemma = fields[2];
            // "_" means "no lemma", unless the token itself is "_".
            if (lemma.Equals("") || (lemma.Equals("_") && !text.Equals("_")))
            {
                lemma = null;
            }
            if (!fields[3].Equals("_"))
            {
                // Recombine POS and features into the "POS|features" tag name.
                if (!(fields[5].Equals("") || fields[5].Equals("_")))
                {
                    posString = fields[3] + "|" + fields[5];
                }
                else
                {
                    posString = fields[3];
                }
            }
            // Optional trailing columns: NE tag, NE type, and token ID.
            if (fieldsLength >= 12 && !fields[10].Equals("_"))
            {
                neString = fields[10];
            }
            if (fieldsLength >= 12 && !fields[11].Equals("_"))
            {
                neTypeString = fields[11];
            }
            if (fieldsLength >= 13 && !fields[12].Equals("_"))
            {
                tokenId = fields[12];
            }
        }
        if (text.Equals(""))
        {
            throw new FormatException($"Text field empty in: {line}");
        }
        if (tokenId == null)
        {
            tokenId = $"{fileId}:{sentenceIndex}:{tokenIndex}";
        }
        // Run the tokenizer over the single token to determine its type.
        TaggedToken token;
        if (tokenizer == null)
        {
            token = new TaggedToken(new Token(TokenType.Unknown, text, 0), tokenId);
        }
        else
        {
            tokenizer.Reset(new StringReader(text));
            Token subToken = tokenizer.Tokenize();
            token = new TaggedToken(new Token(subToken.Type, text, 0), tokenId);
        }
        int posTag = -1, neTag = -1, neTypeTag = -1;
        try
        {
            if (posString != null)
            {
                posTag = PosTagSet.GetTagId(posString, extend);
            }
            if (neString != null)
            {
                // Map BILOU-style U (unit) and L (last) tags onto the B/I scheme
                // used internally.
                if (neString.Equals("U"))
                {
                    neString = "B";
                }
                else if (neString.Equals("L"))
                {
                    neString = "I";
                }
                neTag = NeTagSet.GetTagId(neString, false);
            }
        }
        catch (TagNameException e)
        {
            Console.WriteLine(e);
            throw;
        }
        if (neTypeString != null)
        {
            neTypeTag = NeTypeTagSet.GetTagId(neTypeString, extend);
        }
        token.Lemma = lemma;
        token.PosTag = posTag;
        token.NeTag = neTag;
        token.NeTypeTag = neTypeTag;
        sentence.Add(token);
        tokenIndex++;
    }
    // Flush a final sentence that was not followed by a trailing blank line.
    if (sentence.Count > 0)
    {
        sentences.Add(sentence.ToArray());
    }
    return sentences.Count == 0 ? null : sentences.ToArray();
}
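// A minimal usage sketch, not part of the original source: ReadConllFile is a
// hypothetical convenience wrapper showing how ReadConll would typically be
// driven. Here extend: false rejects unknown POS tags rather than growing the
// tag set, and plain: false selects the full CoNLL column layout.
public TaggedToken[][] ReadConllFile(string path)
{
    using (StreamReader reader = new StreamReader(path))
    {
        // Use the file name (without extension) as the fileId that seeds
        // generated token IDs of the form "fileId:sentence:token".
        string fileId = Path.GetFileNameWithoutExtension(path);
        return ReadConll(reader, fileId, extend: false, plain: false);
    }
}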