/// <summary>
/// Reads one line from the line stream and converts the token in front of the
/// first space into a mixed-case name.
/// </summary>
/// <returns>
/// A <c>StringList</c> containing the single converted name, or <c>null</c> when
/// the line stream returns <c>null</c>, the line is empty, or there is no
/// non-empty token in front of a space.
/// </returns>
public virtual StringList read()
{
    string line = lineStream.read();
    if (line == null || StringUtil.isEmpty(line))
    {
        return null;
    }

    // find the location of the name separator in the line of data.
    // pos == -1 means there is no separator; pos == 0 means the line starts with
    // the separator, so the token would be empty and Substring(0, 1) below would
    // throw. Both cases are reported as "no name".
    int pos = line.IndexOf(' ');
    if (pos <= 0)
    {
        return null;
    }

    string parsed = line.Substring(0, pos);
    string name2;

    // the data is in ALL CAPS ... so the easiest way is to convert
    // back to standard mixed case.
    if (parsed.Length > 2 && parsed.StartsWith("MC", StringComparison.Ordinal))
    {
        // "MC" prefix: capitalise the third letter as well, e.g. MCDONALD -> McDonald.
        name2 = parsed.Substring(0, 1).ToUpper(locale)
                + parsed.Substring(1, 1).ToLower(locale)
                + parsed.Substring(2, 1).ToUpper(locale)
                + parsed.Substring(3).ToLower(locale);
    }
    else
    {
        name2 = parsed.Substring(0, 1).ToUpper(locale)
                + parsed.Substring(1).ToLower(locale);
    }

    return new StringList(new string[] { name2 });
}
/// <summary>
/// Builds a name dictionary from the entries supplied by a sample stream.
/// </summary>
/// <param name="sampleStream">
/// Stream of samples; it is read until it returns <c>null</c>.
/// </param>
/// <returns>
/// A <c>Dictionary</c> holding every entry read from the stream; an entry the
/// dictionary already contains is not added a second time.
/// </returns>
/// <exception cref="IOException"> IOException </exception>
public static Dictionary createDictionary(ObjectStream<StringList> sampleStream)
{
    Dictionary nameDictionary = new Dictionary(true);

    for (StringList current = sampleStream.read(); current != null; current = sampleStream.read())
    {
        if (nameDictionary.contains(current))
        {
            continue;
        }

        nameDictionary.put(current);
    }

    return nameDictionary;
}
/// <summary>
/// Reads samples in the format named by the first argument and writes each
/// sample's string form to the console.
/// </summary>
/// <param name="format">
/// Not read on entry; the format name is always taken from <c>args[0]</c>.
/// </param>
/// <param name="args">
/// The format name followed by the format-specific arguments. With no arguments
/// the tool help is printed. With only a format name, or a format name followed
/// by <c>help</c>, the format help is printed and the process exits with code 0.
/// </param>
/// <exception cref="TerminateToolException">
/// Thrown with code 1 when argument validation reports an error, and with
/// code -1 when an <see cref="IOException"/> occurs while reading samples.
/// </exception>
public override void run(string format, string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine(Help);
        return;
    }

    string formatName = args[0];
    ObjectStreamFactory factory = getStreamFactory(formatName);

    // Everything after the format name belongs to the format itself.
    string[] formatArgs = new string[args.Length - 1];
    Array.Copy(args, 1, formatArgs, 0, formatArgs.Length);

    string usage = createHelpString(formatName, ArgumentParser.createUsage(factory.Parameters));

    bool helpRequested = formatArgs.Length == 0
                         || (formatArgs.Length == 1 && "help".Equals(formatArgs[0]));
    if (helpRequested)
    {
        Console.WriteLine(usage);
        Environment.Exit(0);
    }

    string validationError = ArgumentParser.validateArgumentsLoudly(formatArgs, factory.Parameters);
    if (validationError != null)
    {
        throw new TerminateToolException(1, validationError + "\n" + usage);
    }

    ObjectStream<T> samples = factory.create(formatArgs);
    try
    {
        for (object sample = samples.read(); sample != null; sample = samples.read())
        {
            Console.WriteLine(sample.ToString());
        }
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while converting data : " + e.Message, e);
    }
    finally
    {
        if (samples != null)
        {
            try
            {
                samples.close();
            }
            catch (IOException)
            {
                // sorry that this can fail
            }
        }
    }
}
/// <summary>
/// Reads the next sentence from the line stream and converts its B-/I-/O name
/// tags into spans.
/// </summary>
/// <remarks>
/// Each data line must consist of exactly three space-separated fields; the
/// first is used as the token and the third as the name tag. An empty line ends
/// the sentence. Tags whose entity type is not enabled in <c>types</c> are
/// treated as <c>O</c>.
/// </remarks>
/// <returns>
/// The next <c>NameSample</c>, or <c>null</c> once the line stream returns
/// <c>null</c> without any token having been collected.
/// </returns>
/// <exception cref="IOException">
/// Thrown when a line does not have three fields or a tag is neither
/// <c>B-…</c>, <c>I-…</c> nor <c>O</c>.
/// </exception>
public virtual NameSample read()
{
    // Loop instead of recursing when a blank line yields no tokens, so a long
    // run of blank lines cannot grow the call stack. Every pass starts with
    // fresh state, which is what a nested call would have had.
    while (true)
    {
        IList<string> sentence = new List<string>();
        IList<string> tags = new List<string>();
        bool isClearAdaptiveData = false;

        // Empty line indicates end of sentence
        string line;
        while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line))
        {
            if (LANGUAGE.NL.Equals(lang) && line.StartsWith(DOCSTART, StringComparison.Ordinal))
            {
                isClearAdaptiveData = true;
                continue;
            }

            string[] fields = line.Split(' ');
            if (fields.Length != 3)
            {
                throw new IOException("Expected three fields per line in training data, got " + fields.Length + " for line '" + line + "'!");
            }

            sentence.Add(fields[0]);
            tags.Add(fields[2]);
        }

        // Always clear adaptive data for spanish
        if (LANGUAGE.ES.Equals(lang))
        {
            isClearAdaptiveData = true;
        }

        if (sentence.Count == 0)
        {
            if (line == null)
            {
                // source stream is not returning anymore lines
                return null;
            }

            // Just filter out empty events, if two lines in a row are empty
            continue;
        }

        // convert name tags into spans
        IList<Span> names = new List<Span>();
        int beginIndex = -1;
        int endIndex = -1;

        for (int i = 0; i < tags.Count; i++)
        {
            string tag = tags[i];

            // Entity types that were not requested are handled as "outside".
            if (tag.EndsWith("PER", StringComparison.Ordinal) && (types & GENERATE_PERSON_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("ORG", StringComparison.Ordinal) && (types & GENERATE_ORGANIZATION_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("LOC", StringComparison.Ordinal) && (types & GENERATE_LOCATION_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("MISC", StringComparison.Ordinal) && (types & GENERATE_MISC_ENTITIES) == 0)
            {
                tag = "O";
            }

            if (tag.StartsWith("B-", StringComparison.Ordinal))
            {
                // A new name begins; close the one that is still open, if any.
                if (beginIndex != -1)
                {
                    names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
                }
                beginIndex = i;
                endIndex = i + 1;
            }
            else if (tag.StartsWith("I-", StringComparison.Ordinal))
            {
                endIndex++;
            }
            else if (tag.Equals("O"))
            {
                if (beginIndex != -1)
                {
                    names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
                    beginIndex = -1;
                    endIndex = -1;
                }
            }
            else
            {
                throw new IOException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
        {
            names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
        }

        return new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData);
    }
}
/// <summary>
/// Reads the next sentence from the line stream and converts its name tags
/// into spans.
/// </summary>
/// <remarks>
/// English lines must have four fields and German lines five; the first field
/// is used as the token and the last as the name tag. A line starting with the
/// document-start marker sets the clear-adaptive-data flag and must be followed
/// by an empty line. Tags whose entity type is not enabled in <c>types</c> are
/// treated as <c>O</c>.
/// </remarks>
/// <returns>
/// The next <c>NameSample</c>, or <c>null</c> once the line stream returns
/// <c>null</c> without any token having been collected.
/// </returns>
/// <exception cref="IOException">
/// Thrown when the line after the document-start marker is not empty, when a
/// line has the wrong number of fields for the language, or when a tag is
/// neither <c>B-…</c>, <c>I-…</c> nor <c>O</c>.
/// </exception>
public virtual NameSample read()
{
    // Loop instead of recursing when a blank line yields no tokens, so a long
    // run of blank lines cannot grow the call stack. Every pass starts with
    // fresh state, which is what a nested call would have had.
    while (true)
    {
        IList<string> sentence = new List<string>();
        IList<string> tags = new List<string>();
        bool isClearAdaptiveData = false;

        // Empty line indicates end of sentence
        string line;
        while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line))
        {
            if (line.StartsWith(Conll02NameSampleStream.DOCSTART, StringComparison.Ordinal))
            {
                isClearAdaptiveData = true;

                // The marker line is expected to be followed by an empty line.
                string emptyLine = lineStream.read();
                if (!StringUtil.isEmpty(emptyLine))
                {
                    throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
                }

                continue;
            }

            string[] fields = line.Split(" ", true);

            // For English: WORD POS-TAG SC-TAG NE-TAG
            if (LANGUAGE.EN.Equals(lang) && (fields.Length == 4))
            {
                sentence.Add(fields[0]);
                tags.Add(fields[3]); // 3 is NE-TAG
            }
            // For German: WORD LEMA-TAG POS-TAG SC-TAG NE-TAG
            else if (LANGUAGE.DE.Equals(lang) && (fields.Length == 5))
            {
                sentence.Add(fields[0]);
                tags.Add(fields[4]); // 4 is NE-TAG
            }
            else
            {
                throw new IOException("Incorrect number of fields per line for language: '" + line + "'!");
            }
        }

        if (sentence.Count == 0)
        {
            if (line == null)
            {
                // source stream is not returning anymore lines
                return null;
            }

            // Just filter out empty events, if two lines in a row are empty
            continue;
        }

        // convert name tags into spans
        IList<Span> names = new List<Span>();
        int beginIndex = -1;
        int endIndex = -1;

        for (int i = 0; i < tags.Count; i++)
        {
            string tag = tags[i];

            // Entity types that were not requested are handled as "outside".
            if (tag.EndsWith("PER", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_PERSON_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("ORG", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("LOC", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("MISC", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_MISC_ENTITIES) == 0)
            {
                tag = "O";
            }

            if (tag.Equals("O"))
            {
                // O means we don't have anything this round.
                if (beginIndex != -1)
                {
                    names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
                    beginIndex = -1;
                    endIndex = -1;
                }
            }
            else if (tag.StartsWith("B-", StringComparison.Ordinal))
            {
                // B- prefix means we have two same entities next to each other
                if (beginIndex != -1)
                {
                    names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
                }
                beginIndex = i;
                endIndex = i + 1;
            }
            else if (tag.StartsWith("I-", StringComparison.Ordinal))
            {
                // I- starts or continues a current name entity
                if (beginIndex == -1)
                {
                    beginIndex = i;
                    endIndex = i + 1;
                }
                else if (!tag.EndsWith(tags[beginIndex].Substring(1), StringComparison.Ordinal))
                {
                    // we have a new tag type following a tagged word series
                    // also may not have the same I- starting the previous!
                    names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
                    beginIndex = i;
                    endIndex = i + 1;
                }
                else
                {
                    endIndex++;
                }
            }
            else
            {
                throw new IOException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
        {
            names.Add(extract(beginIndex, endIndex, tags[beginIndex]));
        }

        return new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData);
    }
}
/// <summary>
/// Reads the next sentence from the line stream and converts its B-/I-/O name
/// tags into spans.
/// </summary>
/// <remarks>
/// Each data line must consist of exactly two tab-separated fields: the token
/// and its name tag. A line that is empty after trimming ends the sentence.
/// A line starting with <c>###MEDLINE:</c> sets the clear-adaptive-data flag and
/// the line following it is consumed and discarded. Lines containing
/// <c>ABSTRACT TRUNCATED</c> are skipped. Tags whose entity type is not enabled
/// in <c>types</c> are treated as <c>O</c>.
/// </remarks>
/// <returns>
/// The next <c>NameSample</c>, or <c>null</c> once the line stream returns
/// <c>null</c> without any token having been collected.
/// </returns>
/// <exception cref="IOException">
/// Thrown when a line does not have two fields or a tag is neither
/// <c>B-…</c>, <c>I-…</c> nor <c>O</c>.
/// </exception>
public virtual NameSample read()
{
    // Loop instead of recursing when a blank line yields no tokens, so a long
    // run of blank lines cannot grow the call stack. Every pass starts with
    // fresh state, which is what a nested call would have had.
    while (true)
    {
        IList<string> sentence = new List<string>();
        IList<string> tags = new List<string>();
        bool isClearAdaptiveData = false;

        // Empty line indicates end of sentence
        string line;
        while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.Trim()))
        {
            if (line.StartsWith("###MEDLINE:", StringComparison.Ordinal))
            {
                isClearAdaptiveData = true;
                // Discard the line that follows the header.
                lineStream.read();
                continue;
            }

            if (line.Contains("ABSTRACT TRUNCATED"))
            {
                continue;
            }

            string[] fields = line.Split("\t", true);
            if (fields.Length != 2)
            {
                throw new IOException("Expected two fields per line in training data, got " + fields.Length + " for line '" + line + "'!");
            }

            sentence.Add(fields[0]);
            tags.Add(fields[1]);
        }

        if (sentence.Count == 0)
        {
            if (line == null)
            {
                // source stream is not returning anymore lines
                return null;
            }

            // Just filter out empty events, if two lines in a row are empty
            continue;
        }

        // convert name tags into spans
        IList<Span> names = new List<Span>();
        int beginIndex = -1;
        int endIndex = -1;

        for (int i = 0; i < tags.Count; i++)
        {
            string tag = tags[i];

            // Entity types that were not requested are handled as "outside".
            if (tag.EndsWith("DNA", StringComparison.Ordinal) && (types & GENERATE_DNA_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("protein", StringComparison.Ordinal) && (types & GENERATE_PROTEIN_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("cell_type", StringComparison.Ordinal) && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
            {
                tag = "O";
            }
            // NOTE(review): cell_line is gated by the same GENERATE_CELLTYPE_ENTITIES
            // flag as cell_type - confirm whether a separate cell-line flag was intended.
            if (tag.EndsWith("cell_line", StringComparison.Ordinal) && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
            {
                tag = "O";
            }
            if (tag.EndsWith("RNA", StringComparison.Ordinal) && (types & GENERATE_RNA_ENTITIES) == 0)
            {
                tag = "O";
            }

            if (tag.StartsWith("B-", StringComparison.Ordinal))
            {
                // A new name begins; close the one that is still open, if any.
                // Substring(2) drops the "B-" prefix to obtain the span type.
                if (beginIndex != -1)
                {
                    names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                }
                beginIndex = i;
                endIndex = i + 1;
            }
            else if (tag.StartsWith("I-", StringComparison.Ordinal))
            {
                endIndex++;
            }
            else if (tag.Equals("O"))
            {
                if (beginIndex != -1)
                {
                    names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                    beginIndex = -1;
                    endIndex = -1;
                }
            }
            else
            {
                throw new IOException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
        {
            names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
        }

        return new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData);
    }
}