Example #1
        private static TaggedToken[][][] GetFold(TaggedToken[][] sentences, int foldsCount, int developmentPercentage, int testPercentage, int foldNumber)
        {
            int j, k;

            TaggedToken[][][] parts = new TaggedToken[3][][];

            List <int> order = new List <int>(sentences.Length);

            for (j = 0; j < sentences.Length; j++)
            {
                order.Add(j);
            }

            Collections.Shuffle(order, new Random(1));
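            // Note: the shuffled 'order' list is never consulted below; the folds are
            // taken in the original corpus order. The divisions by 1000 below also mean
            // that the "percentage" arguments are effectively per-mille values.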

            int developmentCount = developmentPercentage * sentences.Length / 1000;

            int testCount = testPercentage * sentences.Length / 1000;

            int trainCount = sentences.Length - (developmentCount + testCount);

            parts[0] = new TaggedToken[trainCount][];

            parts[1] = new TaggedToken[developmentCount][];

            parts[2] = new TaggedToken[testCount][];

            int factor = sentences.Length * foldNumber / foldsCount;

            for (j = 0, k = 0; j < factor; j++)
            {
                parts[0][k++] = sentences[j];
            }

            for (j = factor + developmentCount + testCount; j < sentences.Length; j++)
            {
                parts[0][k++] = sentences[j];
            }

            for (j = 0; j < developmentCount; j++)
            {
                parts[1][j] = sentences[factor + j];
            }

            for (j = 0; j < testCount; j++)
            {
                parts[2][j] = sentences[factor + developmentCount + j];
            }

            return(parts);
        }
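For orientation, a minimal usage sketch (hypothetical: GetFold is private, so the call would sit inside the same class, and 'corpus' plus the per-mille sizes are illustrative assumptions, not values from the original code):

        // Hypothetical call site: one train/development/test split of a corpus.
        // 'corpus' is assumed to be a TaggedToken[][] loaded elsewhere; 50 and 100
        // are per-mille sizes (5% development, 10% test) for a 10-fold setup.
        TaggedToken[][][] split = GetFold(corpus, 10, 50, 100, 0);

        TaggedToken[][] train       = split[0];
        TaggedToken[][] development = split[1];
        TaggedToken[][] test        = split[2];

        Console.WriteLine($"train={train.Length}, dev={development.Length}, test={test.Length}");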
Example #2
        public TaggedToken(TaggedToken taggedToken)
        {
            Token = taggedToken.Token;

            LowerCaseText = taggedToken.LowerCaseText;

            Id = taggedToken.Id;

            Lemma = taggedToken.Lemma;

            PosTag = taggedToken.PosTag;

            NeTag = taggedToken.NeTag;

            NeTypeTag = taggedToken.NeTypeTag;
        }
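A short sketch of the copy constructor in use (hypothetical; 'original' and 'newPosTag' are placeholders): since every field is copied, retagging the copy leaves the original annotation intact.

        // Hypothetical usage: duplicate a token before modifying its tags.
        TaggedToken copy = new TaggedToken(original);

        copy.PosTag = newPosTag;   // 'original.PosTag' is unchanged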
Example #3
        /// <summary>
        /// Internal recursive function for visiting all of the tokens in the correct order and
        /// creating the patterns
        /// </summary>
        /// <param name="tokens">
        /// List of tokens to be patterned
        /// </param>
        /// <param name="patterns">
        /// The list of patterns created so far
        /// </param>
        /// <param name="targetToken">
        /// The index of the token that should be worked on (visited)
        /// </param>
        /// <returns>
        /// RECURSIVE:
        /// eventually returns the complete set of patterns that the list of tokens can be represented by.
        /// </returns>
        private static List <string> internalMakePatterns(List <TaggedToken> tokens, IList <string> patterns, int targetToken)
        {
            if (tokens == null)
            {
                throw new ArgumentNullException("tokens");
            }

            if (patterns == null)
            {
                throw new ArgumentNullException("patterns");
            }

            if (targetToken < 0 || targetToken >= tokens.Count)
            {
                throw new ArgumentOutOfRangeException("targetToken",
                                                      String.Format("targetToken needs to be at least 0 and less than the number of tokens.  Value was: {0}", targetToken));
            }

            TaggedToken   t       = tokens[targetToken];
            List <string> newList = new List <string>();

            foreach (var item in t.Tags)
            {
                if (targetToken == 0)
                {
                    newList.Add(item);
                }
                foreach (var pattern in patterns)
                {
                    string newitem = pattern + item;
                    if (!newList.Contains(newitem))
                    {
                        newList.Add(newitem);
                    }
                }
            }

            if (targetToken + 1 == tokens.Count)
            {
                return(newList);
            }
            else
            {
                return(internalMakePatterns(tokens, newList, targetToken + 1));
            }
        }
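To make the recursion concrete: for two tokens tagged {A, B} and {x, y}, the method returns the cross-product {Ax, Bx, Ay, By}. A minimal driver sketch, assuming the object-initializer-friendly TaggedToken from Example #4 and a call site inside the same class (the method is private):

        // Hypothetical driver: two tokens with two tags each yield four patterns.
        var tokens = new List<TaggedToken>
        {
            new TaggedToken { Tags = new List<string> { "A", "B" } },
            new TaggedToken { Tags = new List<string> { "x", "y" } },
        };

        // Seed with an empty pattern list and start at the first token.
        List<string> patterns = internalMakePatterns(tokens, new List<string>(), 0);

        // 'patterns' now holds "Ax", "Bx", "Ay", "By".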
Example #4
        /// <summary>
        /// Given an input string, split it into component tokens.  In the process, consult the lexicon for matches and
        /// apply the tags from the lexicon to the split tokens.
        /// </summary>
        /// <param name="In">
        /// The string to be tokenized and tagged
        /// </param>
        /// <returns>
        /// A list of Tagged Tokens
        /// </returns>
        public List <TaggedToken> Tag(String In)
        {
            if (In == null)
            {
                throw new ArgumentNullException("In");
            }

            List <TaggedToken> ret = new List <TaggedToken>();

            // Split string using the configured separators and split options
            string[] rawtokens = In.Split(_options.Separators.ToArray <string>(), _options.StringSplitOptions);

            // For each token
            for (int i = 0; i < rawtokens.Length; i++)
            {
                // Make a new model object
                var token = new TaggedToken()
                {
                    // Value of the token
                    Value = rawtokens[i],
                    // position of the token in the input string (the nth token)
                    TokenSequence = i
                };

                // Attempt to search the lexicon for a matching token.
                LexiconEntry found;
                string       sought = _options.MatchesAreCaseSensitive ? token.Value : token.Value.ToLower();
                if (_lexicon.TryGetValue(sought, out found))
                {
                    token.Tags = found.Tags;
                }
                else
                {
                    // Apply a placeholder tag for tokens not in the lexicon
                    token.Tags.Add(_options.EmptyTagValue);
                }

                // Add the token to the list
                ret.Add(token);
            }

            // return the list
            return(ret);
        }
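A sketch of Tag in action (hypothetical; 'tagger' is assumed to be a configured instance of the class above, with separators and a lexicon already set up):

        // Hypothetical usage: tag a sentence and print each token with its tags.
        List<TaggedToken> tagged = tagger.Tag("the quick brown fox");

        foreach (TaggedToken token in tagged)
        {
            Console.WriteLine($"{token.TokenSequence}: {token.Value} [{string.Join(", ", token.Tags)}]");
        }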
Example #5
        public bool ConsistentWith(TaggedToken taggedToken)
        {
            if (PosTag >= 0 && taggedToken.PosTag >= 0 && PosTag != taggedToken.PosTag)
            {
                return(false);
            }

            if (Lemma != null && taggedToken.Lemma != null && !Lemma.Equals(taggedToken.Lemma))
            {
                return(false);
            }

            if (NeTag >= 0 && taggedToken.NeTag >= 0 && NeTag != taggedToken.NeTag)
            {
                return(false);
            }

            return(NeTypeTag < 0 || taggedToken.NeTypeTag < 0 || NeTypeTag == taggedToken.NeTypeTag);
        }
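ConsistentWith treats negative tag IDs and a null Lemma as "unset", so only annotations present on both tokens can conflict. A minimal sketch, assuming 'gold' and 'predicted' tokens obtained elsewhere:

        // Hypothetical usage: check a predicted token against a gold token.
        // Unset annotations on either side never cause a conflict.
        if (!predicted.ConsistentWith(gold))
        {
            Console.WriteLine($"Conflict at {gold.Id}: annotations disagree.");
        }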
Example #6
        private static TaggedToken[][][] GetSUCFold(TaggedToken[][] sentences, int fold)
        {
            TaggedToken[][][] parts = new TaggedToken[3][][];

            HashSet <string> fileSet = new HashSet <string>();

            foreach (TaggedToken[] sentence in sentences)
            {
                string fileId = sentence[0].Id.Substring(0, 4);

                fileSet.Add(fileId);
            }

            List <string> files = new List <string>(fileSet);

            Collections.Sort(files);

            Debug.Assert(files.Count == 500);
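            // (the Stockholm-Umeå Corpus is distributed as 500 source files, hence the assertion)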

            Dictionary <string, int> fileIndex = new Dictionary <string, int>();

            for (int i = 0; i < files.Count; i++)
            {
                string fileId = files[i];

                fileIndex[fileId] = i;
            }

            int developmentCount = 0, testCount = 0, trainCount = 0;

            foreach (TaggedToken[] sentence in sentences)
            {
                string fileId = sentence[0].Id.Substring(0, 4);

                int index = fileIndex[fileId];

                if ((index % 10) == fold)
                {
                    testCount++;
                }
                else if ((((index + 1) % 10) == fold) && (index / 10 % 5 == 0))
                {
                    developmentCount++;
                }

                else
                {
                    trainCount++;
                }
            }

            parts[0] = new TaggedToken[trainCount][];

            parts[1] = new TaggedToken[developmentCount][];

            parts[2] = new TaggedToken[testCount][];

            int developmentIndex = 0, testIndex = 0, trainIndex = 0;

            foreach (TaggedToken[] sentence in sentences)
            {
                string fileId = sentence[0].Id.Substring(0, 4);

                int index = fileIndex[fileId];

                if ((index % 10) == fold)
                {
                    parts[2][testIndex++] = sentence;
                }
                else if ((((index + 1) % 10) == fold) && (index / 10 % 5 == 0))
                {
                    parts[1][developmentIndex++] = sentence;
                }
                else
                {
                    parts[0][trainIndex++] = sentence;
                }
            }

            return(parts);
        }
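Splitting on the file ID (the first four characters of each token ID) keeps all sentences from the same source file in a single partition, which avoids leakage between training and test data. A usage sketch (hypothetical; 'sucSentences' is assumed to be the loaded corpus):

        // Hypothetical call site: build the split for fold 0 and report its sizes.
        TaggedToken[][][] fold = GetSUCFold(sucSentences, 0);

        Console.WriteLine($"train={fold[0].Length}, dev={fold[1].Length}, test={fold[2].Length}");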
Example #7
        public static void Main(string[] args)
        {
            string lexiconFile = null;

            string trainFile = null;

            string developmentFile = null;

            string modelFile = null;

            List <Dictionary> posDictionaries = new List <Dictionary>();

            List <Embedding> posEmbeddings = new List <Embedding>();

            List <Dictionary> neDictionaries = new List <Dictionary>();

            List <Embedding> neEmbeddings = new List <Embedding>();

            int posBeamSize = 8;

            int neBeamSize = 4;

            string language = null;

            bool preserve = false;

            bool plainOutput = false;

            string fold = null;

            int maximumPosIterations = 16;

            int maximumNeIterations = 16;

            bool extendLexicon = true;

            bool hasNe = true;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Equals("-lexicon"))
                {
                    lexiconFile = args[++i];
                }
                else if (args[i].Equals("-dict"))
                {
                    string destination = args[++i];

                    Dictionary dictionary = new Dictionary();

                    try
                    {
                        dictionary.FromFile(args[++i]);
                    }
                    catch (IOException e)
                    {
                        Console.WriteLine("Can not load dictionary file.");

                        Console.WriteLine(e.StackTrace);

                        Environment.Exit(1);
                    }

                    if (destination.Equals("pos"))
                    {
                        posDictionaries.Add(dictionary);
                    }
                    else if (destination.Equals("ne"))
                    {
                        neDictionaries.Add(dictionary);
                    }
                    else if (destination.Equals("all"))
                    {
                        posDictionaries.Add(dictionary);

                        neDictionaries.Add(dictionary);
                    }
                    else
                    {
                        Console.WriteLine("Expected pos/ne/all.");

                        Environment.Exit(1);
                    }
                }
                else if (args[i].Equals("-lang"))
                {
                    language = args[++i];
                }
                else if (args[i].Equals("-extendlexicon"))
                {
                    extendLexicon = true;
                }
                else if (args[i].Equals("-noextendlexicon"))
                {
                    extendLexicon = false;
                }
                else if (args[i].Equals("-noner"))
                {
                    hasNe = false;
                }
                else if (args[i].Equals("-positers"))
                {
                    maximumPosIterations = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-neiters"))
                {
                    maximumNeIterations = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-posbeamsize"))
                {
                    posBeamSize = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-nebeamsize"))
                {
                    neBeamSize = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-preserve"))
                {
                    preserve = true;
                }
                else if (args[i].Equals("-plain"))
                {
                    plainOutput = true;
                }
                else if (args[i].Equals("-fold"))
                {
                    fold = args[++i];
                }
                else if (args[i].Equals("-embed"))
                {
                    string destination = args[++i];

                    Embedding embedding = new Embedding();

                    try
                    {
                        embedding.FromFile(args[++i]);
                    }
                    catch (IOException e)
                    {
                        Console.WriteLine("Can not load embedding file.");

                        Console.WriteLine(e.StackTrace);

                        Environment.Exit(1);
                    }

                    if (destination.Equals("pos"))
                    {
                        posEmbeddings.Add(embedding);
                    }
                    else if (destination.Equals("ne"))
                    {
                        neEmbeddings.Add(embedding);
                    }
                    else if (destination.Equals("all"))
                    {
                        posEmbeddings.Add(embedding);

                        neEmbeddings.Add(embedding);
                    }
                    else
                    {
                        Console.WriteLine("Expected pos/ne/all.");

                        Environment.Exit(1);
                    }
                }
                else if (args[i].Equals("-trainfile"))
                {
                    trainFile = args[++i];
                }
                else if (args[i].Equals("-devfile"))
                {
                    developmentFile = args[++i];
                }
                else if (args[i].Equals("-modelfile"))
                {
                    modelFile = args[++i];
                }
                else if (args[i].Equals("-train"))
                {
                    TaggedToken[][] developmentSentences = null;

                    if (trainFile == null || modelFile == null || language == null)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    TaggedData taggedData = new TaggedData(language);

                    TaggedToken[][] trainSentences = taggedData.ReadConll(trainFile, null, true, !trainFile.EndsWith(".conll"));

                    if (developmentFile != null)
                    {
                        developmentSentences = taggedData.ReadConll(developmentFile, null, true, !developmentFile.EndsWith(".conll"));
                    }

                    Console.WriteLine($"Read {trainSentences.Length} training sentences and {developmentSentences?.Length ?? 0} development sentences.");

                    Tagger tagger = GetTagger(language, taggedData, posBeamSize, neBeamSize);

                    tagger.BuildLexicons(trainSentences);

                    Lexicon lexicon = tagger.PosLexicon;

                    Console.WriteLine($"POS lexicon size (corpus) {lexicon.Size}.");

                    if (lexiconFile != null)
                    {
                        Console.WriteLine(extendLexicon ? $"Reading lexicon '{lexiconFile}'." : $"Reading lexicon (not extending profiles) '{lexiconFile}'.");

                        lexicon.FromFile(lexiconFile, taggedData.PosTagSet, extendLexicon);

                        Console.WriteLine($"POS lexicon size (external) {lexicon.Size}.");
                    }

                    tagger.PosDictionaries = posDictionaries;

                    tagger.PosEmbeddings = posEmbeddings;

                    tagger.NeDictionaries = neDictionaries;

                    tagger.NeEmbeddings = neEmbeddings;

                    tagger.MaximumPosIterations = maximumPosIterations;

                    tagger.MaximumNeIterations = maximumNeIterations;

                    tagger.Train(trainSentences, developmentSentences);

                    BinaryFormatter formatter = new BinaryFormatter();

                    formatter.Serialize(new FileStream(modelFile, FileMode.Create), tagger);
                }
                else if (args[i].Equals("-cross"))
                {
                    TaggedData taggedData = new TaggedData(language);

                    TaggedToken[][] allSentences = taggedData.ReadConll(trainFile, null, true, !trainFile.EndsWith(".conll"));

                    Tagger tagger = GetTagger(language, taggedData, posBeamSize, neBeamSize);

                    tagger.PosDictionaries = posDictionaries;

                    tagger.PosEmbeddings = posEmbeddings;

                    tagger.NeDictionaries = neDictionaries;

                    tagger.NeEmbeddings = neEmbeddings;

                    const int foldsCount = 10;

                    Evaluation evaluation = new Evaluation();

                    for (int j = 0; j < foldsCount; j++)
                    {
                        Evaluation localEvaluation = new Evaluation();

                        TaggedToken[][][] parts = GetSUCFold(allSentences, j);

                        Console.WriteLine($"Fold {j}, train ({parts[0].Length}), development ({parts[1].Length}), test ({parts[2].Length})");

                        Lexicon lexicon = tagger.PosLexicon;

                        lexicon.Clear();

                        tagger.BuildLexicons(parts[0]);

                        if (lexiconFile != null)
                        {
                            lexicon.FromFile(lexiconFile, taggedData.PosTagSet, extendLexicon);
                        }

                        tagger.Train(parts[0], parts[1]);

                        foreach (TaggedToken[] sentence in parts[2])
                        {
                            TaggedToken[] taggedSentence = tagger.TagSentence(sentence, true, false);

                            evaluation.Evaluate(taggedSentence, sentence);

                            localEvaluation.Evaluate(taggedSentence, sentence);

                            tagger.TaggedData.WriteConllGold(new StreamWriter(Console.OpenStandardOutput()), taggedSentence, sentence, plainOutput);
                        }

                        Console.WriteLine($"Local POS accuracy: {localEvaluation.GetPosAccuracy()} ({localEvaluation.PosCorrect} / {localEvaluation.PosTotal})");
                    }

                    Console.WriteLine($"POS accuracy: {evaluation.GetPosAccuracy()} ({evaluation.PosCorrect} / {evaluation.PosTotal})");

                    Console.WriteLine($"NE precision: {evaluation.GetNePrecision()}");

                    Console.WriteLine($"NE recall:    {evaluation.GetNeRecall()}");

                    Console.WriteLine($"NE F-score:   {evaluation.GetNeFScore()}");

                    Console.WriteLine($"NE total:     {evaluation.NeTotal}");

                    Console.WriteLine($"NE correct:   {evaluation.NeCorrect}");

                    Console.WriteLine($"NE found:     {evaluation.NeFound}");
                }
                else if (args[i].Equals("-server"))
                {
                    if (modelFile == null || i >= args.Length - 2)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    IPAddress serverIp = Dns.GetHostAddresses(args[++i]).FirstOrDefault();

                    int serverPort = int.Parse(args[++i]);

                    BinaryFormatter formatter = new BinaryFormatter();

                    Console.WriteLine("Loading Stagger model ...");

                    Tagger tagger = (Tagger)formatter.Deserialize(new FileStream(modelFile, FileMode.Open));

                    language = tagger.TaggedData.Language;

                    TcpListener tcpListener = new TcpListener(serverIp, serverPort);

                    tcpListener.Start(4);

                    while (true)
                    {
                        Socket sock = null;

                        try
                        {
                            sock = tcpListener.AcceptSocket();

                            Console.WriteLine($"Connected to {sock.RemoteEndPoint}");

                            NetworkStream networkStream = new NetworkStream(sock);

                            byte[] lengthBuffer = new byte[4];

                            // Note: a single Read may legitimately return fewer bytes than
                            // requested; robust code would loop until the buffer is full.
                            if (networkStream.Read(lengthBuffer) != 4)
                            {
                                throw new IOException("Cannot read length.");
                            }

                            int length = BitConverter.ToInt32(lengthBuffer);

                            if (length < 1 || length > 100000)
                            {
                                throw new IOException($"Invalid data size {length}.");
                            }

                            byte[] dataBuf = new byte[length];
                            if (networkStream.Read(dataBuf) != length)
                            {
                                throw new IOException("Can not read data.");
                            }

                            StringReader reader = new StringReader(Encoding.UTF8.GetString(dataBuf));

                            StreamWriter writer = new StreamWriter(networkStream, Encoding.UTF8);

                            Tokenizer tokenizer = GetTokenizer(reader, language);

                            List <Token> sentence;

                            int sentenceIndex = 0;

                            string fileId = "net";

                            while ((sentence = tokenizer.ReadSentence()) != null)
                            {
                                TaggedToken[] taggedSentence = new TaggedToken[sentence.Count];

                                if (tokenizer.SentenceId != null)
                                {
                                    if (!fileId.Equals(tokenizer.SentenceId))
                                    {
                                        fileId = tokenizer.SentenceId;

                                        sentenceIndex = 0;
                                    }
                                }

                                for (int j = 0; j < sentence.Count; j++)
                                {
                                    Token token = sentence[j];

                                    var id = $"{fileId}:{sentenceIndex}:{token.Offset}";

                                    taggedSentence[j] = new TaggedToken(token, id);
                                }

                                TaggedToken[] taggedSent = tagger.TagSentence(taggedSentence, true, false);

                                tagger.TaggedData.WriteConllSentence(writer, taggedSent, plainOutput);

                                sentenceIndex++;
                            }

                            tokenizer.Close();

                            if (sock.Connected)
                            {
                                Console.WriteLine($"Closing connection to {sock.RemoteEndPoint}.");

                                writer.Close();
                            }
                        }
                        catch (IOException e)
                        {
                            Console.WriteLine(e.StackTrace);

                            if (sock != null)
                            {
                                Console.WriteLine($"Connection failed to {sock.RemoteEndPoint}.");

                                if (sock.Connected)
                                {
                                    sock.Close();
                                }
                            }
                        }
                    }
                }
                else if (args[i].Equals("-tag"))
                {
                    if (modelFile == null || i >= args.Length - 1)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    List <string> inputFiles = new List <string>();

                    for (i++; i < args.Length && !args[i].StartsWith("-"); i++)
                    {
                        inputFiles.Add(args[i]);
                    }
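                    // Note: the outer loop's i++ will also step past the argument that
                    // terminated this file list, so -tag is evidently meant to be the
                    // last option on the command line.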

                    if (inputFiles.Count < 1)
                    {
                        Console.WriteLine("No files to tag.");

                        Environment.Exit(1);
                    }

                    BinaryFormatter formatter = new BinaryFormatter();

                    Console.WriteLine("Loading Stagger model ...");

                    Tagger tagger = (Tagger)formatter.Deserialize(new FileStream(modelFile, FileMode.Open));

                    language = tagger.TaggedData.Language;

                    tagger.ExtendLexicon = extendLexicon;

                    if (!hasNe)
                    {
                        tagger.HasNe = false;
                    }

                    foreach (string inputFile in inputFiles)
                    {
                        if (!(inputFile.EndsWith(".txt") || inputFile.EndsWith(".txt.gz")))
                        {
                            TaggedToken[][] inputSentence = tagger.TaggedData.ReadConll(inputFile, null, true, !inputFile.EndsWith(".conll"));

                            Evaluation evaluation = new Evaluation();

                            int count = 0;

                            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput(), Encoding.UTF8);

                            foreach (TaggedToken[] sentence in inputSentence)
                            {
                                if (count % 100 == 0)
                                {
                                    Console.WriteLine($"Tagging sentence number {count}.\r");
                                }

                                count++;

                                TaggedToken[] taggedSentence = tagger.TagSentence(sentence, true, preserve);

                                evaluation.Evaluate(taggedSentence, sentence);

                                tagger.TaggedData.WriteConllGold(writer, taggedSentence, sentence, plainOutput);
                            }

                            writer.Close();

                            Console.WriteLine($"Tagging sentence number {count}.");

                            Console.WriteLine($"POS accuracy: {evaluation.GetPosAccuracy()} ({evaluation.PosCorrect} / {evaluation.PosTotal}).");

                            Console.WriteLine($"NE precision: {evaluation.GetNePrecision()}.");

                            Console.WriteLine($"NE recall:    {evaluation.GetNeRecall()}.");

                            Console.WriteLine($"NE F-score:   {evaluation.GetNeFScore()}.");
                        }
                        else
                        {
                            string fileId = Path.GetFileNameWithoutExtension(inputFile);

                            TextReader reader = OpenUtf8File(inputFile);

                            StreamWriter writer;

                            if (inputFiles.Count > 1)
                            {
                                string outputFile = $"{inputFile}{(plainOutput ? ".plain" : ".conll")}";

                                writer = new StreamWriter(new FileStream(outputFile, FileMode.Create), Encoding.UTF8);
                            }
                            else
                            {
                                writer = new StreamWriter(Console.OpenStandardOutput(), Encoding.UTF8);
                            }

                            Tokenizer tokenizer = GetTokenizer(reader, language);

                            List <Token> sentence;

                            int sentenceIndex = 0;

                            while ((sentence = tokenizer.ReadSentence()) != null)
                            {
                                TaggedToken[] sent = new TaggedToken[sentence.Count];

                                if (tokenizer.SentenceId != null)
                                {
                                    if (!fileId.Equals(tokenizer.SentenceId))
                                    {
                                        fileId = tokenizer.SentenceId;

                                        sentenceIndex = 0;
                                    }
                                }

                                for (int j = 0; j < sentence.Count; j++)
                                {
                                    Token tok = sentence[j];

                                    var id = $"{fileId}:{sentenceIndex}:{tok.Offset}";

                                    sent[j] = new TaggedToken(tok, id);
                                }

                                TaggedToken[] taggedSent = tagger.TagSentence(sent, true, false);

                                tagger.TaggedData.WriteConllSentence(writer, taggedSent, plainOutput);

                                sentenceIndex++;
                            }

                            tokenizer.Close();

                            writer?.Close();
                        }
                    }
                }
                else if (args[i].Equals("-tokenize"))
                {
                    string inputFile = args[++i];

                    TextReader reader = OpenUtf8File(inputFile);

                    Tokenizer tokenizer = GetTokenizer(reader, language);

                    List <Token> sentence;

                    while ((sentence = tokenizer.ReadSentence()) != null)
                    {
                        if (sentence.Count == 0)
                        {
                            continue;
                        }

                        if (!plainOutput)
                        {
                            Console.Write(sentence[0].Value.Replace(' ', '_'));

                            for (int j = 1; j < sentence.Count; j++)
                            {
                                Console.Write($" {sentence[j].Value.Replace(' ', '_')}");
                            }

                            Console.WriteLine("");
                        }
                        else
                        {
                            foreach (Token token in sentence)
                            {
                                Console.WriteLine(token.Value);
                            }

                            Console.WriteLine();
                        }
                    }

                    tokenizer.Close();
                }
            }
        }
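Because the argument parser works strictly left to right, action flags such as -train, -tag, and -server must appear after the options they depend on. Two hypothetical invocations (the Program class name and all file names are placeholders; only the option names are taken from the parser above):

        // Train a Swedish model, then tag a plain-text file with it.
        Program.Main(new[]
        {
            "-lexicon", "saldo.lex", "-lang", "sv",
            "-trainfile", "suc-train.conll", "-devfile", "suc-dev.conll",
            "-modelfile", "swedish.bin", "-train"
        });

        Program.Main(new[] { "-modelfile", "swedish.bin", "-tag", "input.txt" });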