/// <summary>
        /// Reads the next line from the line stream and returns the text in front of the
        /// first space, converted from ALL CAPS to mixed case, as a single-entry list.
        /// </summary>
        /// <returns>
        /// a <c>StringList</c> holding the converted name, or <c>null</c> when the line
        /// stream returns <c>null</c>, the line is empty, or the line has no text in
        /// front of a space separator.
        /// </returns>
        public virtual StringList read()
        {
            string line = lineStream.read();

            // Nothing to parse: the stream is exhausted or the line is empty.
            if ((line == null) || StringUtil.isEmpty(line))
            {
                return(null);
            }

            // find the location of the name separator in the line of data.
            int pos = line.IndexOf(' ');

            // pos == -1: no separator at all. pos == 0: the line starts with the separator,
            // so the name would be empty and Substring(0, 1) below would throw an
            // ArgumentOutOfRangeException. Both cases are reported as "no name".
            if (pos <= 0)
            {
                return(null);
            }

            string parsed = line.Substring(0, pos);
            string name2;

            // the data is in ALL CAPS ... so the easiest way is to convert
            // back to standard mixed case.
            if ((parsed.Length > 2) && (parsed.StartsWith("MC", StringComparison.Ordinal)))
            {
                // "MCxyz" becomes "McXyz": the character after the prefix is capitalised too.
                // The length check guarantees that index 2 exists.
                name2 = parsed.Substring(0, 1).ToUpper(locale) + parsed.Substring(1, 1).ToLower(locale) + parsed.Substring(2, 1).ToUpper(locale) + parsed.Substring(3).ToLower(locale);
            }
            else
            {
                // First character upper case, the remainder lower case.
                name2 = parsed.Substring(0, 1).ToUpper(locale) + parsed.Substring(1).ToLower(locale);
            }

            return(new StringList(new string[] { name2 }));
        }
Exemple #2
0
        /// <summary>
        /// Builds a dictionary from every entry the given stream produces.
        /// </summary>
        /// <param name="sampleStream"> stream of entries; it is read until it returns <c>null</c>. </param>
        /// <returns> a <c>Dictionary</c> containing each entry of the stream once. </returns>
        /// <exception cref="IOException"> IOException </exception>
        public static Dictionary createDictionary(ObjectStream <StringList> sampleStream)
        {
            Dictionary names = new Dictionary(true);

            // A null entry marks the end of the stream.
            for (StringList item = sampleStream.read(); item != null; item = sampleStream.read())
            {
                // Entries that are already present are skipped.
                if (names.contains(item))
                {
                    continue;
                }

                names.put(item);
            }

            return(names);
        }
        /// <summary>
        /// Reads all samples of the format named by the first argument and prints each
        /// of them to the console.
        /// </summary>
        /// <param name="format"> overwritten with <c>args[0]</c> when arguments are present. </param>
        /// <param name="args"> the format name followed by the arguments for that format. </param>
        public override void run(string format, string[] args)
        {
            // Without arguments only the general help text is printed.
            if (args.Length == 0)
            {
                Console.WriteLine(Help);
                return;
            }

            format = args[0];
            ObjectStreamFactory factory = getStreamFactory(format);

            // Everything after the format name is handed to the format itself.
            int formatArgCount = args.Length - 1;
            string[] formatArgs = new string[formatArgCount];
            Array.Copy(args, 1, formatArgs, 0, formatArgCount);

            string helpString = createHelpString(format, ArgumentParser.createUsage(factory.Parameters));

            // No format arguments, or a lone "help": print the format help and exit.
            bool helpRequested = formatArgs.Length == 0
                                 || (formatArgs.Length == 1 && "help".Equals(formatArgs[0]));
            if (helpRequested)
            {
                Console.WriteLine(helpString);
                Environment.Exit(0);
            }

            string problem = ArgumentParser.validateArgumentsLoudly(formatArgs, factory.Parameters);
            if (problem != null)
            {
                throw new TerminateToolException(1, problem + "\n" + helpString);
            }

            ObjectStream <T> stream = factory.create(formatArgs);

            try
            {
                // Print samples until the stream signals its end with null.
                for (object item = stream.read(); item != null; item = stream.read())
                {
                    Console.WriteLine(item.ToString());
                }
            }
            catch (IOException ioe)
            {
                throw new TerminateToolException(-1, "IO error while converting data : " + ioe.Message, ioe);
            }
            finally
            {
                if (stream != null)
                {
                    try
                    {
                        stream.close();
                    }
                    catch (IOException)
                    {
                        // closing is best effort; a failure here is deliberately ignored
                    }
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Reads the lines of the next sentence from the line stream and converts the
        /// B-/I-/O tags in the third field into name spans.
        /// </summary>
        /// <returns>
        /// the next sample, or <c>null</c> once the line stream returns no more lines.
        /// </returns>
        /// <exception cref="IOException">
        /// if a line does not split into exactly three fields, or a tag is neither "O"
        /// nor prefixed with "B-" or "I-".
        /// </exception>
        public virtual NameSample read()
        {
            IList <string> tokens  = new List <string>();
            IList <string> rawTags = new List <string>();

            bool clearAdaptive = false;

            // A sentence ends at the first empty line or at the end of the stream.
            string line = lineStream.read();
            while (line != null && !StringUtil.isEmpty(line))
            {
                if (LANGUAGE.NL.Equals(lang) && line.StartsWith(DOCSTART, StringComparison.Ordinal))
                {
                    // Document marker: remember it, it carries no token.
                    clearAdaptive = true;
                }
                else
                {
                    string[] fields = line.Split(' ');

                    if (fields.Length != 3)
                    {
                        throw new IOException("Expected three fields per line in training data, got " + fields.Length + " for line '" + line + "'!");
                    }

                    tokens.Add(fields[0]);
                    rawTags.Add(fields[2]);
                }

                line = lineStream.read();
            }

            // Spanish data always clears the adaptive data.
            if (LANGUAGE.ES.Equals(lang))
            {
                clearAdaptive = true;
            }

            if (tokens.Count == 0)
            {
                // An empty line without a sentence in front of it is skipped by reading
                // again; a null line means the source has no more lines.
                return(line != null ? read() : null);
            }

            IList <Span> spans = new List <Span>();

            int spanStart = -1;
            int spanEnd   = -1;
            for (int i = 0; i < rawTags.Count; i++)
            {
                string tag = rawTags[i];

                // Tags of entity types that are switched off in the types bit mask are
                // treated as outside tags.
                bool suppressed =
                    (tag.EndsWith("PER", StringComparison.Ordinal) && (types & GENERATE_PERSON_ENTITIES) == 0) ||
                    (tag.EndsWith("ORG", StringComparison.Ordinal) && (types & GENERATE_ORGANIZATION_ENTITIES) == 0) ||
                    (tag.EndsWith("LOC", StringComparison.Ordinal) && (types & GENERATE_LOCATION_ENTITIES) == 0) ||
                    (tag.EndsWith("MISC", StringComparison.Ordinal) && (types & GENERATE_MISC_ENTITIES) == 0);
                if (suppressed)
                {
                    tag = "O";
                }

                if (tag == "O")
                {
                    // Outside tag: close the span that is open, if any.
                    if (spanStart != -1)
                    {
                        spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
                        spanStart = -1;
                        spanEnd   = -1;
                    }
                }
                else if (tag.StartsWith("I-", StringComparison.Ordinal))
                {
                    spanEnd++;
                }
                else if (tag.StartsWith("B-", StringComparison.Ordinal))
                {
                    // A new span begins; one that is still open ends right here.
                    if (spanStart != -1)
                    {
                        spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
                    }

                    spanStart = i;
                    spanEnd   = i + 1;
                }
                else
                {
                    throw new IOException("Invalid tag: " + tag);
                }
            }

            // A span that reaches the end of the sentence is still open at this point.
            if (spanStart != -1)
            {
                spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
            }

            return(new NameSample(tokens.ToArray(), spans.ToArray(), clearAdaptive));
        }
Exemple #5
0
        /// <summary>
        /// Reads the lines of the next sentence from the line stream and converts the
        /// named entity tags in the last field into name spans.
        /// </summary>
        /// <returns>
        /// the next sample, or <c>null</c> once the line stream returns no more lines.
        /// </returns>
        /// <exception cref="IOException">
        /// if the line after a document start marker is not empty, a line does not have
        /// the field count expected for the language, or a tag is neither "O" nor
        /// prefixed with "B-" or "I-".
        /// </exception>
        public virtual NameSample read()
        {
            IList <string> tokens  = new List <string>();
            IList <string> rawTags = new List <string>();

            bool clearAdaptive = false;

            // A sentence ends at the first empty line or at the end of the stream.
            string line = lineStream.read();
            while (line != null && !StringUtil.isEmpty(line))
            {
                if (line.StartsWith(Conll02NameSampleStream.DOCSTART, StringComparison.Ordinal))
                {
                    clearAdaptive = true;

                    // The document start marker has to be followed by an empty line.
                    string emptyLine = lineStream.read();
                    if (!StringUtil.isEmpty(emptyLine))
                    {
                        throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
                    }
                }
                else
                {
                    string[] fields = line.Split(" ", true);

                    // Index of the NE-TAG field:
                    //   English: WORD POS-TAG SC-TAG NE-TAG            -> 3
                    //   German:  WORD LEMMA-TAG POS-TAG SC-TAG NE-TAG  -> 4
                    int neTagIndex = -1;
                    if (LANGUAGE.EN.Equals(lang) && (fields.Length == 4))
                    {
                        neTagIndex = 3;
                    }
                    else if (LANGUAGE.DE.Equals(lang) && (fields.Length == 5))
                    {
                        neTagIndex = 4;
                    }

                    if (neTagIndex == -1)
                    {
                        throw new IOException("Incorrect number of fields per line for language: '" + line + "'!");
                    }

                    tokens.Add(fields[0]);
                    rawTags.Add(fields[neTagIndex]);
                }

                line = lineStream.read();
            }

            if (tokens.Count == 0)
            {
                // An empty line without a sentence in front of it is skipped by reading
                // again; a null line means the source has no more lines.
                return(line != null ? read() : null);
            }

            IList <Span> spans = new List <Span>();

            int spanStart = -1;
            int spanEnd   = -1;
            for (int i = 0; i < rawTags.Count; i++)
            {
                string tag = rawTags[i];

                // Tags of entity types that are switched off in the types bit mask are
                // treated as outside tags.
                bool suppressed =
                    (tag.EndsWith("PER", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_PERSON_ENTITIES) == 0) ||
                    (tag.EndsWith("ORG", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES) == 0) ||
                    (tag.EndsWith("LOC", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES) == 0) ||
                    (tag.EndsWith("MISC", StringComparison.Ordinal) && (types & Conll02NameSampleStream.GENERATE_MISC_ENTITIES) == 0);
                if (suppressed)
                {
                    tag = "O";
                }

                if (tag.Equals("O"))
                {
                    // Outside tag: close the span that is open, if any.
                    if (spanStart != -1)
                    {
                        spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
                        spanStart = -1;
                        spanEnd   = -1;
                    }
                    continue;
                }

                bool beginTag = tag.StartsWith("B-", StringComparison.Ordinal);
                if (!beginTag && !tag.StartsWith("I-", StringComparison.Ordinal))
                {
                    throw new IOException("Invalid tag: " + tag);
                }

                // Only an I- tag can extend the open span, and only when its ending equals
                // the opening tag without its first character. A B- tag, an I- tag with no
                // open span and an I- tag with a different ending all start a new span.
                bool extendsOpenSpan = !beginTag
                                       && spanStart != -1
                                       && tag.EndsWith(rawTags[spanStart].Substring(1), StringComparison.Ordinal);
                if (extendsOpenSpan)
                {
                    spanEnd++;
                }
                else
                {
                    if (spanStart != -1)
                    {
                        spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
                    }

                    spanStart = i;
                    spanEnd   = i + 1;
                }
            }

            // A span that reaches the end of the sentence is still open at this point.
            if (spanStart != -1)
            {
                spans.Add(extract(spanStart, spanEnd, rawTags[spanStart]));
            }

            return(new NameSample(tokens.ToArray(), spans.ToArray(), clearAdaptive));
        }
        /// <summary>
        /// Reads the lines of the next sentence from the line stream and converts the
        /// B-/I-/O tags in the second, tab separated field into name spans.
        /// </summary>
        /// <returns>
        /// the next sample, or <c>null</c> once the line stream returns no more lines.
        /// </returns>
        /// <exception cref="IOException">
        /// if a line does not split into exactly two fields, or a tag is neither "O"
        /// nor prefixed with "B-" or "I-".
        /// </exception>
        public virtual NameSample read()
        {
            IList <string> tokens  = new List <string>();
            IList <string> rawTags = new List <string>();

            bool clearAdaptive = false;

            // A sentence ends at the first line that is empty after trimming, or at the
            // end of the stream.
            string line = lineStream.read();
            while (line != null && !StringUtil.isEmpty(line.Trim()))
            {
                if (line.StartsWith("###MEDLINE:", StringComparison.Ordinal))
                {
                    // Header line: remember it and discard the line that follows it.
                    clearAdaptive = true;
                    lineStream.read();
                }
                else if (line.Contains("ABSTRACT TRUNCATED"))
                {
                    // Marker line without token data, nothing to collect.
                }
                else
                {
                    string[] fields = line.Split("\t", true);

                    if (fields.Length != 2)
                    {
                        throw new IOException("Expected two fields per line in training data, got " + fields.Length + " for line '" + line + "'!");
                    }

                    tokens.Add(fields[0]);
                    rawTags.Add(fields[1]);
                }

                line = lineStream.read();
            }

            if (tokens.Count == 0)
            {
                // An empty line without a sentence in front of it is skipped by reading
                // again; a null line means the source has no more lines.
                return(line != null ? read() : null);
            }

            IList <Span> spans = new List <Span>();

            int spanStart = -1;
            int spanEnd   = -1;
            for (int i = 0; i < rawTags.Count; i++)
            {
                string tag = rawTags[i];

                // Tags of entity types that are switched off in the types bit mask are
                // treated as outside tags.
                // NOTE(review): "cell_line" is tested against GENERATE_CELLTYPE_ENTITIES,
                // the same flag as "cell_type". This looks like a copy/paste slip; confirm
                // whether a dedicated cell line flag exists and was intended.
                bool suppressed =
                    (tag.EndsWith("DNA", StringComparison.Ordinal) && (types & GENERATE_DNA_ENTITIES) == 0) ||
                    (tag.EndsWith("protein", StringComparison.Ordinal) && (types & GENERATE_PROTEIN_ENTITIES) == 0) ||
                    (tag.EndsWith("cell_type", StringComparison.Ordinal) && (types & GENERATE_CELLTYPE_ENTITIES) == 0) ||
                    (tag.EndsWith("cell_line", StringComparison.Ordinal) && (types & GENERATE_CELLTYPE_ENTITIES) == 0) ||
                    (tag.EndsWith("RNA", StringComparison.Ordinal) && (types & GENERATE_RNA_ENTITIES) == 0);
                if (suppressed)
                {
                    tag = "O";
                }

                if (tag == "O")
                {
                    // Outside tag: close the span that is open, if any.
                    if (spanStart != -1)
                    {
                        spans.Add(new Span(spanStart, spanEnd, rawTags[spanStart].Substring(2)));
                        spanStart = -1;
                        spanEnd   = -1;
                    }
                }
                else if (tag.StartsWith("I-", StringComparison.Ordinal))
                {
                    spanEnd++;
                }
                else if (tag.StartsWith("B-", StringComparison.Ordinal))
                {
                    // A new span begins; one that is still open ends right here.
                    if (spanStart != -1)
                    {
                        spans.Add(new Span(spanStart, spanEnd, rawTags[spanStart].Substring(2)));
                    }

                    spanStart = i;
                    spanEnd   = i + 1;
                }
                else
                {
                    throw new IOException("Invalid tag: " + tag);
                }
            }

            // A span that reaches the end of the sentence is still open at this point.
            if (spanStart != -1)
            {
                spans.Add(new Span(spanStart, spanEnd, rawTags[spanStart].Substring(2)));
            }

            return(new NameSample(tokens.ToArray(), spans.ToArray(), clearAdaptive));
        }