Esempio n. 1
0
 // Parses the body of the record from the underlying Kaitai stream.
 // NOTE(review): this looks like Kaitai Struct generated code; if the .ksy
 // source is regenerated these edits will be lost — prefer fixing the spec.
 private void _read()
 {
     // Optional custom lookup table: a run of 2-byte entries, present only
     // when the header flag says so.
     if (HeaderParameters.Flags.HasCustomLookupTable)
     {
         _customLookupTable = new List <byte[]>();
         for (var i = 0; i < HeaderParameters.NumCustomLookupTableEntries; i++)
         {
             _customLookupTable.Add(m_io.ReadBytes(2));
         }
     }
     // The payload is everything left in the stream, minus one trailing byte
     // when the decompressed length is odd (that byte is read separately below).
     // The read is identical for both branches, so hoist it instead of
     // duplicating the expression (the original repeated it in each branch).
     __raw_data = m_io.ReadBytes(((M_Io.Size - M_Io.Pos) - (IsLenDecompressedOdd ? 1 : 0)));
     var io___raw_data = new KaitaiStream(__raw_data);
     if (HeaderParameters.Flags.Tagged)
     {
         _data = new TaggedData(io___raw_data, this, m_root);
     }
     else
     {
         _data = new UntaggedData(io___raw_data, this, m_root);
     }
     // Odd decompressed length: consume the final padding/leftover byte.
     if (IsLenDecompressedOdd)
     {
         _lastByte = m_io.ReadBytes(1);
     }
 }
Esempio n. 2
0
        /// <summary>
        /// Creates the language-specific tagger for <paramref name="language"/>.
        /// Supported codes: "sv" (SUC), "en" (PTB), "any" (generic), "zh" (CTB).
        /// Any other value (including null) prints an error and terminates the process.
        /// </summary>
        /// <param name="language">Two-letter language code, or "any".</param>
        /// <param name="taggedData">Corpus/tag-set container the tagger operates on.</param>
        /// <param name="posBeamSize">Beam width for part-of-speech decoding.</param>
        /// <param name="neBeamSize">Beam width for named-entity decoding.</param>
        /// <returns>The constructed tagger; never null on normal return.</returns>
        private static Tagger GetTagger(string language, TaggedData taggedData, int posBeamSize, int neBeamSize)
        {
            Tagger tagger = null;

            // switch on the string instead of chained .Equals calls: clearer,
            // and a null language falls through to default instead of throwing
            // NullReferenceException (the -cross code path can pass null here).
            switch (language)
            {
                case "sv":
                    tagger = new SUCTagger(taggedData, posBeamSize, neBeamSize);
                    break;

                case "en":
                    tagger = new PTBTagger(taggedData, posBeamSize, neBeamSize);
                    break;

                case "any":
                    tagger = new GenericTagger(taggedData, posBeamSize, neBeamSize);
                    break;

                case "zh":
                    tagger = new CTBTagger(taggedData, posBeamSize, neBeamSize);
                    break;

                default:
                    Console.WriteLine($"Invalid language '{language}'.");

                    Environment.Exit(1);
                    break;
            }

            return(tagger);
        }
        /// <summary>
        /// Wraps the element at <c>Index</c> of the provided collection in a
        /// <see cref="TaggedData"/> carrying this node's <c>DataTag</c>.
        /// </summary>
        public TaggedData GetData()
        {
            var result = new TaggedData
            {
                Data    = CollectionProvider(_context)[Index],
                DataTag = DataTag
            };

            return result;
        }
        /// <summary>
        /// Looks up previously stored data by tag.
        /// </summary>
        /// <param name="dataTag">Key into the <c>Memory</c> store.</param>
        /// <returns>
        /// A <see cref="TaggedData"/> carrying the stored value, or null when
        /// no entry exists for <paramref name="dataTag"/>.
        /// </returns>
        public TaggedData LoadFromMemory(string dataTag)
        {
            // Idiomatic out-variable declaration; use the `object` keyword
            // rather than the CLR type name `Object`.
            if (!Memory.TryGetValue(dataTag, out object dataToReturn))
            {
                return null;
            }

            return new TaggedData
            {
                DataTag = dataTag, Data = dataToReturn
            };
        }
Esempio n. 5
0
 /// <summary>
 /// Resolves the value flowing out of a node port.
 /// "LinkedOption" returns the node itself; "DataOut" returns the indexed
 /// collection element wrapped in a <see cref="TaggedData"/>; anything else
 /// (or missing context / out-of-range index) yields null.
 /// </summary>
 public override object GetValue(NodePort port)
 {
     if (port.fieldName == "LinkedOption")
     {
         return(this);
     }
     if (port.fieldName == "DataOut")
     {
         if (_context == null)
         {
             return(null);
         }
         List <Object> collection = CollectionProvider(_context);
         if (collection != null && collection.Count > Index)
         {
             // Index the collection we already fetched instead of calling
             // CollectionProvider(_context) a second time (the original did);
             // avoids redundant work and any inconsistency if the provider
             // builds a fresh collection per call.
             TaggedData taggedData = new TaggedData {
                 Data    = collection[Index],
                 DataTag = DataTag
             };
             return(taggedData);
         }
     }
     return(null);
 }
Esempio n. 6
0
        /// <summary>
        /// Command-line entry point for the tagger tool. Options are consumed
        /// left-to-right; value-taking options read their argument(s) via
        /// <c>args[++i]</c>, and action options (-train, -cross, -server, -tag,
        /// -tokenize) execute immediately using whatever settings were parsed
        /// so far — so option order matters.
        /// </summary>
        public static void Main(string[] args)
        {
            string lexiconFile = null;

            string trainFile = null;

            string developmentFile = null;

            string modelFile = null;

            List <Dictionary> posDictionaries = new List <Dictionary>();

            List <Embedding> posEmbeddings = new List <Embedding>();

            List <Dictionary> neDictionaries = new List <Dictionary>();

            List <Embedding> neEmbeddings = new List <Embedding>();

            int posBeamSize = 8;

            int neBeamSize = 4;

            string language = null;

            bool preserve = false;

            bool plainOutput = false;

            // NOTE(review): `fold` is parsed from -fold below but never read
            // anywhere in this method — dead option or unfinished feature?
            string fold = null;

            int maximumPosIterations = 16;

            int maximumNeIterations = 16;

            bool extendLexicon = true;

            bool hasNe = true;

            for (int i = 0; i < args.Length; i++)
            {
                // -lexicon <file>: external POS lexicon used during training/tagging.
                if (args[i].Equals("-lexicon"))
                {
                    lexiconFile = args[++i];
                }
                // -dict <pos|ne|all> <file>: consumes TWO following arguments.
                else if (args[i].Equals("-dict"))
                {
                    string destination = args[++i];

                    Dictionary dictionary = new Dictionary();

                    try
                    {
                        dictionary.FromFile(args[++i]);
                    }
                    catch (IOException e)
                    {
                        Console.WriteLine("Can not load dictionary file.");

                        Console.WriteLine(e.StackTrace);

                        Environment.Exit(1);
                    }

                    if (destination.Equals("pos"))
                    {
                        posDictionaries.Add(dictionary);
                    }
                    else if (destination.Equals("ne"))
                    {
                        neDictionaries.Add(dictionary);
                    }
                    else if (destination.Equals("all"))
                    {
                        posDictionaries.Add(dictionary);

                        neDictionaries.Add(dictionary);
                    }
                    else
                    {
                        Console.WriteLine("Expected pos/ne/all.");

                        Environment.Exit(1);
                    }
                }
                else if (args[i].Equals("-lang"))
                {
                    language = args[++i];
                }
                else if (args[i].Equals("-extendlexicon"))
                {
                    extendLexicon = true;
                }
                else if (args[i].Equals("-noextendlexicon"))
                {
                    extendLexicon = false;
                }
                else if (args[i].Equals("-noner"))
                {
                    hasNe = false;
                }
                else if (args[i].Equals("-positers"))
                {
                    maximumPosIterations = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-neiters"))
                {
                    maximumNeIterations = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-posbeamsize"))
                {
                    posBeamSize = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-nebeamsize"))
                {
                    neBeamSize = int.Parse(args[++i]);
                }
                else if (args[i].Equals("-preserve"))
                {
                    preserve = true;
                }
                else if (args[i].Equals("-plain"))
                {
                    plainOutput = true;
                }
                else if (args[i].Equals("-fold"))
                {
                    fold = args[++i];
                }
                // -embed <pos|ne|all> <file>: consumes TWO following arguments,
                // mirroring the -dict handling above.
                else if (args[i].Equals("-embed"))
                {
                    string destination = args[++i];

                    Embedding embedding = new Embedding();

                    try
                    {
                        embedding.FromFile(args[++i]);
                    }
                    catch (IOException e)
                    {
                        Console.WriteLine("Can not load embedding file.");

                        Console.WriteLine(e.StackTrace);

                        Environment.Exit(1);
                    }

                    if (destination.Equals("pos"))
                    {
                        posEmbeddings.Add(embedding);
                    }
                    else if (destination.Equals("ne"))
                    {
                        neEmbeddings.Add(embedding);
                    }
                    else if (destination.Equals("all"))
                    {
                        posEmbeddings.Add(embedding);

                        neEmbeddings.Add(embedding);
                    }
                    else
                    {
                        Console.WriteLine("Expected pos/ne/all.");

                        Environment.Exit(1);
                    }
                }
                else if (args[i].Equals("-trainfile"))
                {
                    trainFile = args[++i];
                }
                else if (args[i].Equals("-devfile"))
                {
                    developmentFile = args[++i];
                }
                else if (args[i].Equals("-modelfile"))
                {
                    modelFile = args[++i];
                }
                // -train: build lexicons from the training corpus, train the
                // tagger, and serialize the model to modelFile.
                else if (args[i].Equals("-train"))
                {
                    TaggedToken[][] developmentSentences = null;

                    if (trainFile == null || modelFile == null || language == null)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    TaggedData taggedData = new TaggedData(language);

                    // Files not ending in ".conll" are treated as the "extra
                    // format" variant (the final boolean flag).
                    TaggedToken[][] trainSentences = taggedData.ReadConll(trainFile, null, true, !trainFile.EndsWith(".conll"));

                    if (developmentFile != null)
                    {
                        developmentSentences = taggedData.ReadConll(developmentFile, null, true, !developmentFile.EndsWith(".conll"));
                    }

                    Console.WriteLine($"Read {trainSentences.Length} training sentences and {developmentSentences?.Length ?? 0} development sentences.");

                    Tagger tagger = GetTagger(language, taggedData, posBeamSize, neBeamSize);

                    tagger.BuildLexicons(trainSentences);

                    Lexicon lexicon = tagger.PosLexicon;

                    Console.WriteLine($"POS lexicon size (corpus) {lexicon.Size}.");

                    if (lexiconFile != null)
                    {
                        Console.WriteLine(extendLexicon ? $"Reading lexicon '{lexiconFile}'." : $"Reading lexicon (not extending profiles) '{lexiconFile}'.");

                        lexicon.FromFile(lexiconFile, taggedData.PosTagSet, extendLexicon);

                        Console.WriteLine($"POS lexicon size (external) {lexicon.Size}.");
                    }

                    tagger.PosDictionaries = posDictionaries;

                    tagger.PosEmbeddings = posEmbeddings;

                    tagger.NeDictionaries = neDictionaries;

                    tagger.NeEmbeddings = neEmbeddings;

                    tagger.MaximumPosIterations = maximumPosIterations;

                    tagger.MaximumNeIterations = maximumNeIterations;

                    tagger.Train(trainSentences, developmentSentences);

                    // NOTE(review): BinaryFormatter is insecure and removed in
                    // .NET 9 — consider a safer serializer. Also the FileStream
                    // passed to Serialize is never disposed (no using), so the
                    // handle leaks until GC.
                    BinaryFormatter formatter = new BinaryFormatter();

                    formatter.Serialize(new FileStream(modelFile, FileMode.Create), tagger);
                }
                // -cross: 10-fold cross-validation over the training corpus.
                else if (args[i].Equals("-cross"))
                {
                    // NOTE(review): unlike -train, there is no null check here;
                    // a missing -trainfile or -lang crashes with an NRE below.
                    TaggedData taggedData = new TaggedData(language);

                    TaggedToken[][] allSentences = taggedData.ReadConll(trainFile, null, true, !trainFile.EndsWith(".conll"));

                    Tagger tagger = GetTagger(language, taggedData, posBeamSize, neBeamSize);

                    tagger.PosDictionaries = posDictionaries;

                    tagger.PosEmbeddings = posEmbeddings;

                    tagger.NeDictionaries = neDictionaries;

                    tagger.NeEmbeddings = neEmbeddings;

                    const int foldsCount = 10;

                    // `evaluation` accumulates across all folds; a fresh
                    // `localEvaluation` is scored per fold.
                    Evaluation evaluation = new Evaluation();

                    for (int j = 0; j < foldsCount; j++)
                    {
                        Evaluation localEvaluation = new Evaluation();

                        // parts[0] = train, parts[1] = development, parts[2] = test.
                        TaggedToken[][][] parts = GetSUCFold(allSentences, j);

                        Console.WriteLine($"Fold {j}, train ({parts[0].Length}), development ({parts[1].Length}), test ({parts[2].Length})");

                        Lexicon lexicon = tagger.PosLexicon;

                        // The tagger instance is reused across folds, so the
                        // lexicon must be reset before rebuilding.
                        lexicon.Clear();

                        tagger.BuildLexicons(parts[0]);

                        if (lexiconFile != null)
                        {
                            lexicon.FromFile(lexiconFile, taggedData.PosTagSet, extendLexicon);
                        }

                        tagger.Train(parts[0], parts[1]);

                        foreach (TaggedToken[] sentence in parts[2])
                        {
                            TaggedToken[] taggedSentence = tagger.TagSentence(sentence, true, false);

                            evaluation.Evaluate(taggedSentence, sentence);

                            localEvaluation.Evaluate(taggedSentence, sentence);

                            tagger.TaggedData.WriteConllGold(new StreamWriter(Console.OpenStandardOutput()), taggedSentence, sentence, plainOutput);
                        }

                        Console.WriteLine($"Local POS accuracy: {localEvaluation.GetPosAccuracy()} ({localEvaluation.PosCorrect} / {localEvaluation.PosTotal})");
                    }

                    Console.WriteLine($"POS accuracy: {evaluation.GetPosAccuracy()} ({evaluation.PosCorrect} / {evaluation.PosTotal})");

                    Console.WriteLine($"NE precision: {evaluation.GetNePrecision()}");

                    Console.WriteLine($"NE recall:    {evaluation.GetNeRecall()}");

                    Console.WriteLine($"NE F-score:   {evaluation.GetNeFScore()}");

                    Console.WriteLine($"NE total:     {evaluation.NeTotal}");

                    Console.WriteLine($"NE correct:   {evaluation.NeCorrect}");

                    Console.WriteLine($"NE found:     {evaluation.NeFound}");
                }
                // -server <host> <port>: load a model and serve tagging over a
                // raw TCP protocol: 4-byte little-endian length prefix followed
                // by that many bytes of UTF-8 text; runs forever.
                else if (args[i].Equals("-server"))
                {
                    if (modelFile == null || i >= args.Length - 1)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    IPAddress serverIp = Dns.GetHostAddresses(args[++i]).FirstOrDefault();

                    int serverPort = int.Parse(args[++i]);

                    // NOTE(review): BinaryFormatter deserialization of a model
                    // file is an insecure-deserialization risk; FileStream is
                    // also never disposed.
                    BinaryFormatter formatter = new BinaryFormatter();

                    Console.WriteLine("Loading Stagger model ...");

                    Tagger tagger = (Tagger)formatter.Deserialize(new FileStream(modelFile, FileMode.Open));

                    language = tagger.TaggedData.Language;

                    TcpListener tcpListener = new TcpListener(serverIp, serverPort);

                    // Backlog of 4 pending connections; clients are handled one
                    // at a time on this thread.
                    tcpListener.Start(4);

                    while (true)
                    {
                        Socket sock = null;

                        try
                        {
                            sock = tcpListener.AcceptSocket();

                            Console.WriteLine($"Connected to {sock.RemoteEndPoint}");

                            NetworkStream networkStream = new NetworkStream(sock);

                            byte[] lengthBuffer = new byte[4];

                            // NOTE(review): Stream.Read may legally return fewer
                            // bytes than requested on a healthy connection; a
                            // short-but-valid read is treated as an error here.
                            if (networkStream.Read(lengthBuffer) != 4)
                            {
                                throw new IOException("Can not read length.");
                            }

                            int length = BitConverter.ToInt32(lengthBuffer);

                            // Sanity-bound the declared payload size.
                            if (length < 1 || length > 100000)
                            {
                                throw new IOException($"Invalid data size {length}.");
                            }

                            byte[] dataBuf = new byte[length];
                            if (networkStream.Read(dataBuf) != length)
                            {
                                throw new IOException("Can not read data.");
                            }

                            StringReader reader = new StringReader(Encoding.UTF8.GetString(dataBuf));

                            StreamWriter writer = new StreamWriter(networkStream, Encoding.UTF8);

                            Tokenizer tokenizer = GetTokenizer(reader, language);

                            List <Token> sentence;

                            int sentenceIndex = 0;

                            string fileId = "net";

                            while ((sentence = tokenizer.ReadSentence()) != null)
                            {
                                TaggedToken[] taggedSentence = new TaggedToken[sentence.Count];

                                // Restart sentence numbering whenever the
                                // tokenizer reports a new sentence/file id.
                                if (tokenizer.SentenceId != null)
                                {
                                    if (!fileId.Equals(tokenizer.SentenceId))
                                    {
                                        fileId = tokenizer.SentenceId;

                                        sentenceIndex = 0;
                                    }
                                }

                                for (int j = 0; j < sentence.Count; j++)
                                {
                                    Token token = sentence[j];

                                    // Token id format: file:sentence:offset.
                                    var id = $"{fileId}:{sentenceIndex}:{token.Offset}";

                                    taggedSentence[j] = new TaggedToken(token, id);
                                }

                                TaggedToken[] taggedSent = tagger.TagSentence(taggedSentence, true, false);

                                // NOTE(review): `writer` is always non-null at
                                // this point, so the ?? fallback is dead code.
                                tagger.TaggedData.WriteConllSentence(writer ?? new StreamWriter(Console.OpenStandardOutput()), taggedSent, plainOutput);

                                sentenceIndex++;
                            }

                            tokenizer.Close();

                            if (sock.Connected)
                            {
                                Console.WriteLine($"Closing connection to {sock.RemoteEndPoint}.");

                                writer.Close();
                            }
                        }
                        catch (IOException e)
                        {
                            // Per-connection failure: log and keep serving.
                            Console.WriteLine(e.StackTrace);

                            if (sock != null)
                            {
                                Console.WriteLine($"Connection failed to {sock.RemoteEndPoint}.");

                                if (sock.Connected)
                                {
                                    sock.Close();
                                }
                            }
                        }
                    }
                }
                // -tag <files...>: load a model and tag the given files.
                // .conll-style inputs are re-tagged and evaluated against their
                // gold tags; .txt/.txt.gz inputs are tokenized and tagged raw.
                else if (args[i].Equals("-tag"))
                {
                    if (modelFile == null || i >= args.Length - 1)
                    {
                        Console.WriteLine("Insufficient data.");

                        Environment.Exit(1);
                    }

                    List <string> inputFiles = new List <string>();

                    // Consume file names up to the next "-" option (note: this
                    // advances the outer loop variable `i`).
                    for (i++; i < args.Length && !args[i].StartsWith("-"); i++)
                    {
                        inputFiles.Add(args[i]);
                    }

                    if (inputFiles.Count < 1)
                    {
                        Console.WriteLine("No files to tag.");

                        Environment.Exit(1);
                    }

                    // NOTE(review): same BinaryFormatter / undisposed-stream
                    // concerns as in -train and -server.
                    BinaryFormatter formatter = new BinaryFormatter();

                    Console.WriteLine("Loading Stagger model ...");

                    Tagger tagger = (Tagger)formatter.Deserialize(new FileStream(modelFile, FileMode.Open));

                    language = tagger.TaggedData.Language;

                    tagger.ExtendLexicon = extendLexicon;

                    if (!hasNe)
                    {
                        tagger.HasNe = false;
                    }

                    foreach (string inputFile in inputFiles)
                    {
                        if (!(inputFile.EndsWith(".txt") || inputFile.EndsWith(".txt.gz")))
                        {
                            // Pre-tokenized (CoNLL-like) input: tag, write to
                            // stdout, and score against the gold annotation.
                            TaggedToken[][] inputSentence = tagger.TaggedData.ReadConll(inputFile, null, true, !inputFile.EndsWith(".conll"));

                            Evaluation evaluation = new Evaluation();

                            int count = 0;

                            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput(), Encoding.UTF8);

                            foreach (TaggedToken[] sentence in inputSentence)
                            {
                                // Progress line every 100 sentences.
                                if (count % 100 == 0)
                                {
                                    Console.WriteLine($"Tagging sentence number {count}.\r");
                                }

                                count++;

                                TaggedToken[] taggedSentence = tagger.TagSentence(sentence, true, preserve);

                                evaluation.Evaluate(taggedSentence, sentence);

                                tagger.TaggedData.WriteConllGold(writer, taggedSentence, sentence, plainOutput);
                            }

                            writer.Close();

                            Console.WriteLine($"Tagging sentence number {count}.");

                            Console.WriteLine($"POS accuracy: {evaluation.GetPosAccuracy()} ({evaluation.PosCorrect} / {evaluation.PosTotal}).");

                            Console.WriteLine($"NE precision: {evaluation.GetNePrecision()}.");

                            Console.WriteLine($"NE recall:    {evaluation.GetNeRecall()}.");

                            Console.WriteLine($"NE F-score:   {evaluation.GetNeFScore()}.");
                        }
                        else
                        {
                            // Plain-text input: tokenize, tag, and write either
                            // to <input>.conll/.plain (multiple files) or stdout.
                            string fileId = Path.GetFileNameWithoutExtension(inputFile);

                            TextReader reader = OpenUtf8File(inputFile);

                            StreamWriter writer;

                            if (inputFiles.Count > 1)
                            {
                                string outputFile = $"{inputFile}{(plainOutput ? ".plain" : ".conll")}";

                                writer = new StreamWriter(new FileStream(outputFile, FileMode.Create), Encoding.UTF8);
                            }
                            else
                            {
                                writer = new StreamWriter(Console.OpenStandardOutput(), Encoding.UTF8);
                            }

                            Tokenizer tokenizer = GetTokenizer(reader, language);

                            List <Token> sentence;

                            int sentenceIndex = 0;

                            while ((sentence = tokenizer.ReadSentence()) != null)
                            {
                                TaggedToken[] sent = new TaggedToken[sentence.Count];

                                // Same sentence-id reset logic as the -server branch.
                                if (tokenizer.SentenceId != null)
                                {
                                    if (!fileId.Equals(tokenizer.SentenceId))
                                    {
                                        fileId = tokenizer.SentenceId;

                                        sentenceIndex = 0;
                                    }
                                }

                                for (int j = 0; j < sentence.Count; j++)
                                {
                                    Token tok = sentence[j];

                                    var id = $"{fileId}:{sentenceIndex}:{tok.Offset}";

                                    sent[j] = new TaggedToken(tok, id);
                                }

                                TaggedToken[] taggedSent = tagger.TagSentence(sent, true, false);

                                tagger.TaggedData.WriteConllSentence(writer ?? new StreamWriter(Console.OpenStandardOutput()), taggedSent, plainOutput);

                                sentenceIndex++;
                            }

                            tokenizer.Close();

                            writer?.Close();
                        }
                    }
                }
                // -tokenize <file>: tokenize only, printing sentences either
                // space-separated (default) or one token per line (-plain).
                else if (args[i].Equals("-tokenize"))
                {
                    string inputFile = args[++i];

                    TextReader reader = OpenUtf8File(inputFile);

                    Tokenizer tokenizer = GetTokenizer(reader, language);

                    List <Token> sentence;

                    while ((sentence = tokenizer.ReadSentence()) != null)
                    {
                        if (sentence.Count == 0)
                        {
                            continue;
                        }

                        if (!plainOutput)
                        {
                            // Spaces inside a token are escaped as underscores
                            // so tokens stay space-separable.
                            Console.Write(sentence[0].Value.Replace(' ', '_'));

                            for (int j = 1; j < sentence.Count; j++)
                            {
                                Console.Write($" {sentence[j].Value.Replace(' ', '_')}");
                            }

                            Console.WriteLine("");
                        }
                        else
                        {
                            foreach (Token token in sentence)
                            {
                                Console.WriteLine(token.Value);
                            }

                            Console.WriteLine();
                        }
                    }

                    tokenizer.Close();
                }
            }
        }
Esempio n. 7
0
        public void ConvertTypeTest_String()
        {
            // "  8" tagged "ThisIsALayer" — presumably the leading spaces (or
            // the tag) keep this from being treated as numeric; the expected
            // inferred type is string.
            var data = new TaggedData("  8", "ThisIsALayer");

            var inferredType = data.GetValueType;

            Assert.IsTrue(inferredType == typeof(string));
        }
Esempio n. 8
0
        public void ConvertTypeTest_DoubleCast()
        {
            // " 10" tagged "100.2" — the expected inferred type is double;
            // presumably the decimal tag drives the inference (TODO confirm
            // against TaggedData's type-detection logic).
            var data = new TaggedData(" 10", "100.2");

            var inferredType = data.GetValueType;

            Assert.IsTrue(inferredType == typeof(double));
        }