Beispiel #1
0
 /// <summary>
 /// Builds the annotator: loads the default coref properties, overlays the
 /// caller-supplied <paramref name="props"/> on top of them, and creates the
 /// coref system from the merged set.
 /// </summary>
 /// <param name="props">User properties; every key is copied over the loaded defaults.</param>
 /// <exception cref="System.Exception">
 /// Thrown, wrapping the original failure, when construction does not succeed.
 /// </exception>
 public HybridCorefAnnotator(Properties props)
 {
     // for backward compatibility
     try
     {
         // Load the default properties
         Properties corefProps = new Properties();
         try
         {
             using (BufferedReader reader = IOUtils.ReaderFromString("edu/stanford/nlp/hcoref/properties/coref-default-dep.properties"))
             {
                 corefProps.Load(reader);
             }
         }
         catch (IOException)
         {
             // Deliberately best-effort: if the defaults cannot be read we continue
             // with only the properties passed in by the caller.
         }
         // Add passed properties
         IEnumeration <object> keys = props.Keys;
         while (keys.MoveNext())
         {
             string key = keys.Current.ToString();
             corefProps.SetProperty(key, props.GetProperty(key));
         }
         // Create coref system
         corefSystem = new HybridCorefSystem(corefProps);
         // Same semantics as Java's Boolean.parseBoolean: true only for a
         // case-insensitive "true", never throws on other values.
         OldFormat = "true".Equals(props.GetProperty("oldCorefFormat", "false"), System.StringComparison.OrdinalIgnoreCase);
     }
     catch (Exception e)
     {
         log.Error("cannot create HybridCorefAnnotator!");
         Sharpen.Runtime.PrintStackTrace(e);
         // System.Exception has no (Exception) constructor; keep the cause as InnerException.
         throw new Exception("cannot create HybridCorefAnnotator!", e);
     }
 }
Beispiel #2
0
 /// <summary>
 /// Builds the annotator: creates the coref system from <paramref name="props"/> and,
 /// unless <c>coref.useCustomMentionDetection</c> is set, a default mention annotator.
 /// </summary>
 /// <param name="props">
 /// Pipeline properties. The key <c>coref.printConLLLoadingMessage</c> is set temporarily
 /// while the coref system is constructed and removed again afterwards.
 /// </param>
 /// <exception cref="System.Exception">
 /// Thrown when hybrid coref is requested for English, or (wrapping the original
 /// failure) when the coref system cannot be created.
 /// </exception>
 public CorefAnnotator(Properties props)
 {
     this.props = props;
     try
     {
         // if user tries to run with coref.language = ENGLISH and coref.algorithm = hybrid, throw Exception
         // we do not support those settings at this time
         if (CorefProperties.Algorithm(props).Equals(CorefProperties.CorefAlgorithmType.Hybrid) && CorefProperties.GetLanguage(props).Equals(Locale.English))
         {
             log.Error("Error: coref.algorithm=hybrid is not supported for English, " + "please change coref.algorithm or coref.language");
             throw new Exception();
         }
         // suppress
         props.SetProperty("coref.printConLLLoadingMessage", "false");
         try
         {
             corefSystem = new CorefSystem(props);
         }
         finally
         {
             // Remove the temporary key even when construction fails, so the
             // caller's properties are not left modified.
             props.Remove("coref.printConLLLoadingMessage");
         }
     }
     catch (Exception e)
     {
         log.Error("Error creating CorefAnnotator...terminating pipeline construction!");
         log.Error(e);
         // System.Exception has no (Exception) constructor; keep the cause as InnerException.
         throw new Exception("Error creating CorefAnnotator...terminating pipeline construction!", e);
     }
     // unless custom mention detection is set, just use the default coref mention detector
     performMentionDetection = !PropertiesUtils.GetBool(props, "coref.useCustomMentionDetection", false);
     if (performMentionDetection)
     {
         mentionAnnotator = new CorefMentionAnnotator(props);
     }
 }
 /// <summary>
 /// Builds the annotator from properties prefixed with <paramref name="name"/>.
 /// </summary>
 /// <remarks>
 /// Optional keys read here: <c>name.nerCoreAnnotation</c>,
 /// <c>name.nerNormalizedCoreAnnotation</c>, <c>name.mentionsCoreAnnotation</c>
 /// (type names that override the default annotation keys), <c>name.acronyms</c>
 /// (falling back to <c>acronyms</c>, default "false") and <c>name.language</c>
 /// (default "en").
 /// </remarks>
 /// <param name="name">Annotator name used as the property prefix.</param>
 /// <param name="props">Properties to read the settings from.</param>
 public EntityMentionsAnnotator(string name, Properties props)
 {
     // note: used in annotate.properties
     // if the user has supplied custom CoreAnnotations for the ner tags and entity mentions override the default keys
     try
     {
         if (props.Contains(name + ".nerCoreAnnotation"))
         {
             nerCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".nerCoreAnnotation"));
         }
         if (props.Contains(name + ".nerNormalizedCoreAnnotation"))
         {
             nerNormalizedCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".nerNormalizedCoreAnnotation"));
         }
         if (props.Contains(name + ".mentionsCoreAnnotation"))
         {
             mentionsCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".mentionsCoreAnnotation"));
         }
     }
     catch (TypeLoadException e)
     {
         // A type that cannot be loaded is only reported; the key keeps its previous value.
         log.Error(e.Message);
     }
     chunkIdentifier = new LabeledChunkIdentifier();
     // Same semantics as Java's Boolean.parseBoolean: true only for a
     // case-insensitive "true", never throws on other values.
     string acronymsSetting = props.GetProperty(name + ".acronyms", props.GetProperty("acronyms", "false"));
     doAcronyms = "true".Equals(acronymsSetting, System.StringComparison.OrdinalIgnoreCase);
     // set up language info, this is needed for handling creating pronominal mentions
     entityMentionsLanguage = LanguageInfo.GetLanguageFromString(props.GetProperty(name + ".language", "en"));
 }
        /// <summary>
        /// Extracts the F-score from the text returned by <c>GetOutput()</c>: the
        /// whitespace-separated token that directly follows the first "FB1:" token.
        /// </summary>
        /// <returns>
        /// The parsed score, or -1 (after logging an error) when no token follows
        /// "FB1:" or "FB1:" is absent.
        /// </returns>
        private double InterpretCmdOutput()
        {
            string output = GetOutput();

            // string.Split would treat "\\s+" as a literal separator; the pattern is a
            // regular expression, so split with Regex instead.
            string[] parts       = System.Text.RegularExpressions.Regex.Split(output, "\\s+");
            int      fScoreIndex = 0;

            for (; fScoreIndex < parts.Length; fScoreIndex++)
            {
                if (parts[fScoreIndex].Equals("FB1:"))
                {
                    break;
                }
            }
            // The score is the token right after the "FB1:" marker.
            fScoreIndex += 1;
            if (fScoreIndex < parts.Length)
            {
                // Invariant culture: the score uses '.' as decimal separator regardless of the machine locale.
                return(double.Parse(parts[fScoreIndex], System.Globalization.CultureInfo.InvariantCulture));
            }
            else
            {
                log.Error("in CRFClassifierEvaluator.interpretCmdOutput(), cannot find FB1 score in output:\n" + output);
                return(-1);
            }
        }
        /// <summary>
        /// Tags one sentence, writing a <c>PartOfSpeechAnnotation</c> onto every token.
        /// </summary>
        /// <remarks>
        /// Sentences with more than <c>maxSentenceLength</c> tokens, and sentences whose
        /// tagging runs out of memory, are not tagged; all of their tokens receive the
        /// placeholder tag "X".
        /// </remarks>
        /// <param name="sentence">Sentence whose <c>TokensAnnotation</c> holds the tokens to tag.</param>
        /// <returns>The same <paramref name="sentence"/> instance; its tokens are updated in place.</returns>
        private ICoreMap DoOneSentence(ICoreMap sentence)
        {
            IList <CoreLabel>  tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <TaggedWord> tagged = null;

            if (tokens.Count <= maxSentenceLength)
            {
                try
                {
                    tagged = pos.TagSentence(tokens, this.reuseTags);
                }
                catch (OutOfMemoryException e)
                {
                    log.Error(e);
                    // Beware that we can now get an OOM in logging, too.
                    log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
                }
            }
            if (tagged != null)
            {
                // The loop bound was previously an undeclared identifier; it is the token count.
                for (int i = 0, sz = tokens.Count; i < sz; i++)
                {
                    tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
                }
            }
            else
            {
                // Not tagged (too long or out of memory): mark every token with "X".
                foreach (CoreLabel token in tokens)
                {
                    token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
                }
            }
            return(sentence);
        }
Beispiel #6
0
        /// <summary>Command-line driver: tokenizes French text from standard input and prints it to standard output.</summary>
        /// <remarks>
        /// Every newline token produced by the tokenizer ends the current output line;
        /// the other tokens of a line are separated by single spaces. Options read here:
        /// <c>help</c>, <c>ftb</c>, <c>options</c>, <c>encoding</c> (default "UTF-8") and
        /// <c>lowerCase</c>. A summary with line count, token count and lines per second
        /// is written to standard error at the end.
        /// </remarks>
        /// <param name="args">Command-line arguments, parsed with <c>ArgOptionDefs()</c>.</param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }

            // Lexer options: choose the factory, then always add newline tokenization
            // on top of whatever the user asked for.
            ITokenizerFactory <CoreLabel> factory;
            if (options.Contains("ftb"))
            {
                factory = FrenchTokenizer.FtbFactory();
            }
            else
            {
                factory = FrenchTokenizer.Factory();
            }
            string lexerOptions = options.GetProperty("options", string.Empty);
            if (lexerOptions.IsEmpty())
            {
                lexerOptions = "tokenizeNLs";
            }
            else
            {
                lexerOptions = lexerOptions + ",tokenizeNLs";
            }
            factory.SetOptions(lexerOptions);

            // Other options
            string encoding  = options.GetProperty("encoding", "UTF-8");
            bool   lowerCase = PropertiesUtils.GetBool(options, "lowerCase", false);

            // Counters for the final summary
            int  lineCount  = 0;
            int  tokenCount = 0;
            long startTime  = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = factory.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool needSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++tokenCount;
                    string word = tokenizer.Current.Word();
                    if (!word.Equals(FrenchLexer.NewlineToken))
                    {
                        // Ordinary token: separate it from the previous one on the same line.
                        if (needSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(lowerCase ? word.ToLower(Locale.French) : word);
                        needSpace = true;
                        continue;
                    }
                    // Newline token: finish the current output line.
                    ++lineCount;
                    needSpace = false;
                    System.Console.Out.WriteLine();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                log.Error(e);
            }

            long   elapsedNanos   = Runtime.NanoTime() - startTime;
            double elapsedSeconds = elapsedNanos / 1e9;
            double linesPerSec    = (double)lineCount / elapsedSeconds;

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", lineCount, tokenCount, linesPerSec);
        }
Beispiel #7
0
 // todo [2017]: This should be redone sometime to not have such a hardcoded upper limit.
 // = null;
 /// <summary>Writes the <c>words_</c> dictionary to <paramref name="serializePath"/> as a serialized object.</summary>
 /// <param name="serializePath">Destination handed to <c>IOUtils.WriteStreamFromString</c>.</param>
 /// <exception cref="RuntimeIOException">Thrown, wrapping the cause, when anything fails while writing.</exception>
 private void SerializeDictionary(string serializePath)
 {
     logger.Info("Serializing dictionaries to " + serializePath + " ... ");
     try
     {
         ObjectOutputStream oos = IOUtils.WriteStreamFromString(serializePath);
         try
         {
             //oos.writeObject(MAX_LEXICON_LENGTH);
             oos.WriteObject(words_);
             //oos.writeObject(cdtos_);
         }
         finally
         {
             // Close the stream even when writing fails, so the handle is not leaked.
             oos.Close();
         }
         logger.Info("done.");
     }
     catch (Exception e)
     {
         logger.Error("Failed", e);
         throw new RuntimeIOException(e);
     }
 }
Beispiel #8
0
        /// <summary>
        /// Parses one sentence under the given constraints and collects the resulting trees.
        /// </summary>
        /// <remarks>
        /// With <c>kBest == 1</c> only the best parse is requested; otherwise the k best
        /// parses are. Each tree's score is stored modulo -10000.0. Failures (no parse,
        /// out of memory, <c>NoSuchParseException</c>) are logged as warnings and yield
        /// whatever trees were collected so far.
        /// </remarks>
        /// <param name="constraints">Constraints handed to the parser query.</param>
        /// <param name="words">Tokens of the sentence to parse.</param>
        /// <returns>The collected trees; empty when parsing failed.</returns>
        private IList <Tree> DoOneSentence(IList <ParserConstraint> constraints, IList <CoreLabel> words)
        {
            IParserQuery query = parser.ParserQuery();

            query.SetConstraints(constraints);
            query.Parse(words);
            IList <Tree> result = Generics.NewLinkedList();

            try
            {
                if (this.kBest != 1)
                {
                    IList <ScoredObject <Tree> > candidates = query.GetKBestParses(this.kBest);
                    if (candidates != null && candidates.Count >= 1)
                    {
                        foreach (ScoredObject <Tree> candidate in candidates)
                        {
                            // -10000 denotes unknown words
                            Tree candidateTree = candidate.Object();
                            candidateTree.SetScore(candidate.Score() % -10000.0);
                            result.Add(candidateTree);
                        }
                    }
                    else
                    {
                        log.Warn("Parsing of sentence failed.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                }
                else
                {
                    // Use bestParse if kBest is set to 1.
                    Tree best = query.GetBestParse();
                    if (best != null)
                    {
                        double bestScore = query.GetBestScore();
                        best.SetScore(bestScore % -10000.0);
                        result.Add(best);
                    }
                    else
                    {
                        log.Warn("Parsing of sentence failed.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                }
            }
            catch (OutOfMemoryException e)
            {
                log.Error(e);
                // Beware that we can now get an OOM in logging, too.
                log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + ").  " + "Will ignore and try to continue.");
            }
            catch (NoSuchParseException)
            {
                log.Warn("Parsing of sentence failed, possibly because of out of memory.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            return(result);
        }
        /// <summary>Launches the server process and records it in <c>server</c>.</summary>
        /// <param name="command">The command line used to start the server.</param>
        /// <returns>True if the process was started; false if starting it raised an <c>IOException</c>.</returns>
        private bool StartServer(string[] command)
        {
            ProcessBuilder builder = new ProcessBuilder(command);
            bool           started;

            try
            {
                // The field is assigned under the instance lock.
                lock (this)
                {
                    this.server = Optional.Of(new WebServiceAnnotator.RunningProcess(this, builder.Start()));
                }
                log.Info("Started server " + StringUtils.Join(command));
                started = true;
            }
            catch (IOException)
            {
                log.Error("Could not start process: " + StringUtils.Join(command));
                started = false;
            }
            return(started);
        }
 /// <summary>
 /// Test driver: reads a fixed sample tree, wraps it in a <c>TreeGraphNode</c> and
 /// prints the node after each processing step.
 /// </summary>
 /// <param name="args">Not used.</param>
 public static void Main(string[] args)
 {
     try
     {
         StringReader sampleText = new StringReader("(S (NP (NNP Sam)) (VP (VBD died) (NP (NN today))))");
         ITreeReader  reader     = new PennTreeReader(sampleText, new LabeledScoredTreeFactory());
         Tree         sample     = reader.ReadTree();
         System.Console.Out.WriteLine(sample);

         TreeGraphNode root = new TreeGraphNode(sample, (TreeGraphNode)null);
         System.Console.Out.WriteLine(root.ToPrettyString(0));

         // The structure object is not used afterwards; the node is printed again
         // once it has been constructed.
         EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(root);
         System.Console.Out.WriteLine(root.ToPrettyString(0));

         root.PercolateHeads(new SemanticHeadFinder());
         System.Console.Out.WriteLine(root.ToPrettyString(0));
     }
     catch (Exception e)
     {
         log.Error("Horrible error: " + e);
         log.Error(e);
     }
 }
 /// <summary>
 /// Loads the Tsurgeon operations from <paramref name="filename"/> (read as UTF-8)
 /// into <c>operations</c> and marks the mapper as loaded.
 /// </summary>
 /// <remarks>
 /// An <c>IOException</c> while reading is only logged; <c>loaded</c> is set to true
 /// in that case as well.
 /// </remarks>
 /// <param name="filename">Path of the Tsurgeon file.</param>
 public static void Load(string filename)
 {
     try
     {
         operations = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.GetOperationsFromFile(filename, "UTF-8", new TregexPatternCompiler());
     }
     catch (IOException)
     {
         // string.Format needs {n} placeholders; the former Java-style %s/%n were printed literally.
         log.Error(string.Format("{0}: Warning - could not load Tsurgeon file from {1}.{2}", typeof(Edu.Stanford.Nlp.Trees.UniversalPOSMapper).Name, filename, System.Environment.NewLine));
     }
     loaded = true;
 }
 /// <summary>Read lexicon from a one-column text file.</summary>
 /// <remarks>
 /// The file is read as UTF-8 and every line is passed to <c>AddStringToLexicon</c>.
 /// A missing file is logged and terminates the process with exit code -1.
 /// </remarks>
 /// <param name="filename">Path of the lexicon file.</param>
 /// <exception cref="System.Exception">Thrown, wrapping the cause, on any other I/O error.</exception>
 private void AddLexicon(string filename)
 {
     try
     {
         BufferedReader lexiconReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
         try
         {
             string lexiconLine;
             while ((lexiconLine = lexiconReader.ReadLine()) != null)
             {
                 AddStringToLexicon(lexiconLine);
             }
         }
         finally
         {
             // The reader was never closed before; release the file handle on every path.
             lexiconReader.Close();
         }
     }
     catch (FileNotFoundException)
     {
         logger.Error("Lexicon not found: " + filename);
         System.Environment.Exit(-1);
     }
     catch (IOException e)
     {
         logger.Error("IO error while reading: " + filename, e);
         // System.Exception has no (Exception) constructor; keep the cause as InnerException.
         throw new Exception("IO error while reading: " + filename, e);
     }
 }
Beispiel #13
0
 /// <summary>
 /// Builds the annotator: creates the sieve coreference system and its mention
 /// extractor from <paramref name="props"/> and, unless
 /// <c>dcoref.useCustomMentionDetection</c> is set, a default mention annotator.
 /// </summary>
 /// <param name="props">Pipeline properties.</param>
 /// <exception cref="System.Exception">
 /// Thrown, wrapping the original failure, when construction does not succeed.
 /// </exception>
 public DeterministicCorefAnnotator(Properties props)
 {
     // for backward compatibility
     try
     {
         corefSystem      = new SieveCoreferenceSystem(props);
         mentionExtractor = new MentionExtractor(corefSystem.Dictionaries(), corefSystem.Semantics());
         OldFormat        = bool.Parse(props.GetProperty("oldCorefFormat", "false"));
         allowReparsing   = PropertiesUtils.GetBool(props, Constants.AllowReparsingProp, Constants.AllowReparsing);
         // unless custom mention detection is set, just use the default coref mention detector
         performMentionDetection = !PropertiesUtils.GetBool(props, "dcoref.useCustomMentionDetection", false);
         if (performMentionDetection)
         {
             mentionAnnotator = new CorefMentionAnnotator(props);
         }
     }
     catch (Exception e)
     {
         log.Error("cannot create DeterministicCorefAnnotator!");
         log.Error(e);
         // System.Exception has no (Exception) constructor; keep the cause as InnerException.
         throw new Exception("cannot create DeterministicCorefAnnotator!", e);
     }
 }
Beispiel #14
0
        /// <summary>
        /// Tokenizes a file in the specified encoding and prints each token to
        /// standard output in the same encoding.
        /// </summary>
        /// <remarks>
        /// Its arguments are (Infile, Encoding). When fewer than two arguments are
        /// given, a usage message is logged and nothing else happens.
        /// </remarks>
        /// <param name="args">Input file path followed by the encoding name.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                log.Error("Usage: CHTBTokenizer inputFile encoding");
                // Stop here: falling through would index past the end of args.
                return;
            }
            string encoding = args[1];
            Reader @in      = IOUtils.ReaderFromString(args[0], encoding);

            for (ITokenizer <string> st = new Edu.Stanford.Nlp.Trees.International.Pennchinese.CHTBTokenizer(@in); st.MoveNext();)
            {
                string s = st.Current;
                EncodingPrintWriter.Out.Println(s, encoding);
            }
        }
        /// <summary>Concatenates the text of all tokens of one entity into a single string.</summary>
        /// <remarks>
        /// No separator is inserted between tokens: Chinese does not use spaces to
        /// separate tokens, and that convention is followed here.
        /// </remarks>
        /// <param name="l">Tokens of the entity; all must carry the NER tag of the first token.</param>
        /// <typeparam name="E">Token type.</typeparam>
        /// <returns>The concatenated <c>TextAnnotation</c> values.</returns>
        /// <exception cref="System.Exception">Thrown when a token's NER tag differs from the first token's.</exception>
        private static string SingleEntityToString <E>(IList <E> l)
            where E : ICoreMap
        {
            string        expectedTag = l[0].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            StringBuilder joined      = new StringBuilder();

            for (int idx = 0; idx < l.Count; idx++)
            {
                E    token   = l[idx];
                bool sameTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(expectedTag);
                if (!sameTag)
                {
                    log.Error("differing NER tags detected in entity: " + l);
                    throw new Exception("Error with entity construction, two tokens had inconsistent NER tags");
                }
                joined.Append(token.Get(typeof(CoreAnnotations.TextAnnotation)));
            }
            return(joined.ToString());
        }
Beispiel #16
0
 /// <summary>
 /// Counts calls and, on every <c>outputFreq</c>-th one, stores the weights
 /// <paramref name="x"/> gzip-compressed in <c>filename</c>.
 /// </summary>
 /// <param name="x">Current weight vector.</param>
 /// <returns>1 when writing the file failed with an <c>IOException</c>; 0 otherwise.</returns>
 public virtual double ValueAt(double[] x)
 {
     if (++i % outputFreq == 0)
     {
         log.Info("Storing interim (double) weights to " + filename + " ... ");
         try
         {
             DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename))));
             try
             {
                 ConvertByteArray.SaveDoubleArr(dos, x);
             }
             finally
             {
                 // Close on every path so a failed write does not leak the file handle.
                 dos.Close();
             }
         }
         catch (IOException)
         {
             log.Error("!");
             return(1);
         }
         log.Info("DONE.");
     }
     return(0);
 }
Beispiel #17
0
 //public static SsurgeonPattern fromXML(String xmlString) throws Exception {
 //SAXBuilder builder = new SAXBuilder();
 //Document jdomDoc = builder.build(xmlString);
 //jdomDoc.getRootElement().getChildren(SsurgeonPattern.SSURGEON_ELEM_TAG);
 //}
 /// <summary>Writes a list of Ssurgeon patterns to a file as indented XML.</summary>
 /// <remarks>
 /// When no XML document can be created for the patterns, a warning is logged and
 /// no file is written. Exceptions are logged and not propagated.
 /// </remarks>
 /// <param name="tgtFile">File to write to.</param>
 /// <param name="patterns">Patterns to serialize.</param>
 public static void WriteToFile(File tgtFile, IList <SsurgeonPattern> patterns)
 {
     try
     {
         IDocument xmlDoc = CreatePatternXMLDoc(patterns);
         if (xmlDoc == null)
         {
             log.Warning("Was not able to create XML document for pattern list, file not written.");
             return;
         }

         Transformer serializer = TransformerFactory.NewInstance().NewTransformer();
         serializer.SetOutputProperty(OutputKeys.Indent, "yes");

         DOMSource    source = new DOMSource(xmlDoc);
         StreamResult target = new StreamResult(tgtFile);
         serializer.Transform(source, target);
     }
     catch (Exception e)
     {
         log.Error(typeof(Edu.Stanford.Nlp.Semgraph.Semgrex.Ssurgeon.Ssurgeon).FullName, "writeToFile");
         log.Error(e);
     }
 }
        /// <summary>Command-line driver for the Buckwalter converter.</summary>
        /// <remarks>
        /// Flags: <c>-u2b</c> (convert Unicode to Buckwalter), <c>-o</c> (output Unicode
        /// values), <c>-d</c> (debug report of unmappable characters). Any other flag
        /// prints the usage text and exits. The first argument that is not a flag is
        /// taken as the input file and ends option parsing; without one, input is
        /// read from standard input.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        public static void Main(string[] args)
        {
            bool unicodeToBuck       = false;
            bool outputUnicodeValues = false;
            File inputFile           = null;

            foreach (string arg in args)
            {
                if (!arg.StartsWith("-"))
                {
                    // First non-flag argument is the input file; stop scanning.
                    inputFile = new File(arg);
                    break;
                }
                if (arg == "-u2b")
                {
                    unicodeToBuck = true;
                }
                else if (arg == "-o")
                {
                    outputUnicodeValues = true;
                }
                else if (arg == "-d")
                {
                    Debug = true;
                }
                else
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    return;
                }
            }

            Edu.Stanford.Nlp.International.Arabic.Buckwalter converter = new Edu.Stanford.Nlp.International.Arabic.Buckwalter(unicodeToBuck);
            converter.outputUnicodeValues = outputUnicodeValues;
            int argIndex = (converter.outputUnicodeValues ? 2 : int.MaxValue);

            if (argIndex >= args.Length)
            {
                // No arguments to convert directly: stream the input file or stdin line by line.
                int lineCount = 0;
                try
                {
                    BufferedReader input;
                    if (inputFile == null)
                    {
                        input = new BufferedReader(new InputStreamReader(Runtime.@in, "utf-8"));
                    }
                    else
                    {
                        input = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), "utf-8"));
                    }
                    System.Console.Error.Printf("Reading input...");
                    string current;
                    while ((current = input.ReadLine()) != null)
                    {
                        EncodingPrintWriter.Out.Println(converter.Apply(current), "utf-8");
                        lineCount++;
                    }
                    input.Close();
                    System.Console.Error.Printf("done.\nConverted %d lines from %s.\n", lineCount, (unicodeToBuck ? "UTF-8 to Buckwalter" : "Buckwalter to UTF-8"));
                }
                catch (UnsupportedEncodingException)
                {
                    log.Error("File system does not support UTF-8 encoding.");
                }
                catch (FileNotFoundException)
                {
                    log.Error("File does not exist: " + inputFile.GetPath());
                }
                catch (IOException)
                {
                    System.Console.Error.Printf("ERROR: IO exception while reading file (line %d).\n", lineCount);
                }
            }
            else
            {
                // Convert the remaining command-line arguments one by one.
                while (argIndex < args.Length)
                {
                    EncodingPrintWriter.Out.Println(args[argIndex] + " -> " + converter.Apply(args[argIndex]), "utf-8");
                    argIndex++;
                }
            }

            if (Debug)
            {
                if (converter.unmappable.KeySet().IsEmpty())
                {
                    EncodingPrintWriter.Err.Println("All characters successfully converted!", "utf-8");
                }
                else
                {
                    EncodingPrintWriter.Err.Println("Characters that could not be converted [passed through!]:", "utf-8");
                    EncodingPrintWriter.Err.Println(converter.unmappable.ToString(), "utf-8");
                }
            }
        }
 /// <summary>
 /// Applies every pattern in <c>patterns</c> to every sentence listed in <c>sentids</c>
 /// and collects the phrases captured by the <c>$term</c> group of each match.
 /// </summary>
 /// <returns>
 /// A triple of: the per-(phrase, pattern) match counts; for each pattern, the
 /// (sentence id, start, inclusive end) spans it matched; and the phrases all of
 /// whose included words already carried <c>label</c>.
 /// </returns>
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     // Token span [s, e) of the "$term" capture group in this match.
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     // Becomes true when at least one included word does not already carry the label.
                     bool   useWordNotLabeled = false;
                     // Becomes true when the matched span must be discarded entirely.
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         // Extend the span to the left across tokens whose answer class equals the label.
                         // NOTE(review): if every token down to index 0 carries the label, the loop
                         // ends without updating s - confirm that this is intended.
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         // Extend the span to the right in the same way.
                         // NOTE(review): likewise, e is left unchanged when all tokens up to the
                         // end of the sentence carry the label - confirm.
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         // Mark the token as matched and record which pattern matched it.
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         // Discard the span if the token has any of the class/value pairs configured to be ignored for this label.
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             // Include the word unless it is a stop word that is to be stripped from the phrase.
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     // A skipped token with included tokens on both sides means a word was
                     // dropped from the middle of the phrase: discard the span.
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         // Record the span with an inclusive end index (e - 1).
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         // Log and rethrow the same exception unchanged.
         logger.Error(e);
         throw;
     }
 }
        /// <summary>
        /// Segments the text of <paramref name="annotation"/> into words and stores the resulting
        /// token list under <c>CoreAnnotations.TokensAnnotation</c>.
        /// </summary>
        /// <remarks>
        /// The method works in two phases. First the text (with newlines stripped or handled according to
        /// <c>tokenizeNewline</c> / <c>sentenceSplitOnTwoNewlines</c>) is handed to <c>segmenter</c> to obtain a
        /// list of words. Then the words are walked in parallel with the per-character labels stored under
        /// <c>SegmenterCoreAnnotations.CharactersAnnotation</c> so that each word receives character offsets.
        /// Characters whose <c>XMLCharAnnotation</c> is not "0" are accumulated into a buffer and emitted as a
        /// single token once plain text or a new tag begins.
        /// </remarks>
        /// <param name="annotation">
        /// The annotation to segment; it is read for its text and character list and mutated by setting the
        /// tokens annotation (the list is attached before it is filled).
        /// </param>
        private void RunSegmentation(ICoreMap annotation)
        {
            //0 2
            // A BC D E
            // 1 10 1 1
            // 0 12 3 4
            // 0, 0+1 ,
            string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            // the original text String
            IList <CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));

            // the way it was divided by splitCharacters
            if (Verbose)
            {
                log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
            }
            // The (initially empty) token list is registered on the annotation right away and filled in below.
            IList <CoreLabel> tokens = new List <CoreLabel>();

            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            // Run the segmenter! On the whole String. It knows not about the splitting into chars.
            // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
            IList <string> words;

            if (!tokenizeNewline)
            {
                // Newlines are not tokens: drop every CR/LF and segment the text in one go.
                text  = text.ReplaceAll("[\r\n]", string.Empty);
                words = segmenter.SegmentString(text);
            }
            else
            {
                // remove leading and trailing newlines
                text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
                text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
                // if using the sentence split on two newlines option, replace single newlines
                // single newlines should be ignored for segmenting
                if (sentenceSplitOnTwoNewlines)
                {
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                    // do a second pass to handle corner case of consecutive isolated newlines
                    // x \n x \n x
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                }
                // Run the segmenter on each line so that we don't get tokens that cross line boundaries
                // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
                // NOTE(review): the pattern uses Java-style %1$s placeholders; confirm string.Format here
                // resolves to a helper that understands them.
                string[] lines = text.Split(string.Format("((?<=%1$s)|(?=%1$s))", separator));
                words = new List <string>();
                foreach (string line in lines)
                {
                    if (separatorPattern.Matcher(line).Matches())
                    {
                        // Don't segment newline tokens, keep them as-is
                        words.Add(line);
                    }
                    else
                    {
                        Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
                    }
                }
            }
            if (Verbose)
            {
                log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
            }
            // Go through everything again and make the final tokens list; for loop is over segmented words
            int pos = 0;
            // This is used to index sentChars, the output from splitCharacters
            // xmlBuffer collects the text of the XML token currently being assembled;
            // xmlBegin is its begin offset, or -1 while no XML token is in progress.
            StringBuilder xmlBuffer = new StringBuilder();
            int           xmlBegin  = -1;

            foreach (string w in words)
            {
                // NOTE(review): sentChars[pos] is not bounds-checked here; this relies on the words and
                // sentChars staying in sync - confirm that cannot drift past the end of sentChars.
                CoreLabel fl = sentChars[pos];
                string    xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
                if (Verbose)
                {
                    log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
                }
                if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
                {
                    // Beginnings of plain text and other XML tags are good places to end an XML tag
                    if (xmlBuffer.Length > 0)
                    {
                        // Form the XML token
                        // The buffered token ends at the end offset of the character just before pos.
                        string    xmlTag = xmlBuffer.ToString();
                        CoreLabel fl1    = sentChars[pos - 1];
                        int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                        // Clean up and prepare for the next XML tag
                        xmlBegin  = -1;
                        xmlBuffer = new StringBuilder();
                    }
                }
                if (!"0".Equals(xmlCharAnnotation))
                {
                    // found an XML character; fl changes inside this loop!
                    while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
                    {
                        // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                        // and we're in sync with segmenter output again
                        xmlBuffer.Append(' ');
                        pos += 1;
                        fl   = sentChars[pos];
                    }
                    xmlBuffer.Append(w);
                    pos = AdvancePos(sentChars, pos, w);
                    if (xmlBegin < 0)
                    {
                        // First word of this XML token: its begin offset comes from the first non-whitespace char.
                        xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    }
                    // XML words are only buffered here; the token is emitted when the buffer is flushed.
                    continue;
                }
                // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
                // Plain-text word: tag its first character with ChineseSegAnnotation "1".
                fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
                if (w.IsEmpty())
                {
                    if (Verbose)
                    {
                        log.Warn("Encountered an empty word. Shouldn't happen?");
                    }
                    continue;
                }
                // [cdm 2016:] surely this shouldn't happen!
                int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                pos = AdvancePos(sentChars, pos, w);
                if (pos - 1 >= sentChars.Count)
                {
                    // The word ran past the end of sentChars: log it and emit no token for this word.
                    log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
                    log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
                }
                else
                {
                    // The token spans from the begin offset of its first char to the end offset of its last char.
                    fl = sentChars[pos - 1];
                    int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    tokens.Add(MakeXmlToken(w, false, begin, end));
                }
            }
            // end for (go through everything again)
            if (xmlBuffer.Length > 0)
            {
                // Form the last XML token, if any
                string    xmlTag = xmlBuffer.ToString();
                CoreLabel fl1    = sentChars[pos - 1];
                int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
            }
            if (Verbose)
            {
                // Dump the final token list for debugging.
                foreach (CoreLabel token in tokens)
                {
                    log.Info(token.ToShorterString());
                }
            }
        }
Beispiel #21
0
        /// <summary>
        /// Scores predicted dependency trees against gold-standard trees and reports
        /// attachment and exact-match metrics as percentages.
        /// </summary>
        /// <remarks>
        /// Metrics produced: UAS / LAS (head, and head plus label, accuracy over all nodes),
        /// UEM (trees whose heads are all correct) and ROOT (trees with the correct root).
        /// The "noPunc" variants ignore nodes whose token tag is one of the tags returned by
        /// <c>GetPunctuationTags()</c>.
        /// </remarks>
        /// <param name="sentences">Sentences supplying the tokens (and thus tags) for each tree</param>
        /// <param name="trees">Predicted parses, one per sentence</param>
        /// <param name="goldTrees">Gold parses, parallel to <paramref name="trees"/></param>
        /// <returns>
        /// A map from metric name to metric value, or <c>null</c> (after logging an error) when the
        /// two tree lists differ in length, a tree pair differs in node count, or a predicted tree
        /// fails <c>IsTree()</c>
        /// </returns>
        public virtual IDictionary <string, double> Evaluate(IList <ICoreMap> sentences, IList <DependencyTree> trees, IList <DependencyTree> goldTrees)
        {
            IDictionary <string, double> metrics = new Dictionary <string, double>();
            // Tags that mark punctuation in this treebank; such nodes are left out of the "noPunc" scores.
            ICollection <string> punctTags = GetPunctuationTags();
            int treeCount = trees.Count;

            if (treeCount != goldTrees.Count)
            {
                log.Error("Incorrect number of trees.");
                return null;
            }

            // Corpus-wide tallies.
            int headHits = 0, headHitsNoPunc = 0;
            int arcHits = 0, arcHitsNoPunc = 0;
            int exactTrees = 0, exactTreesNoPunc = 0;
            int rootHits = 0;
            int totalArcs = 0, totalArcsNoPunc = 0;

            for (int t = 0; t < treeCount; t++)
            {
                IList <CoreLabel> words = sentences[t].Get(typeof(CoreAnnotations.TokensAnnotation));
                DependencyTree predicted = trees[t];
                DependencyTree gold      = goldTrees[t];

                if (predicted.n != gold.n)
                {
                    log.Error("Tree " + (t + 1) + ": incorrect number of nodes.");
                    return null;
                }
                if (!predicted.IsTree())
                {
                    log.Error("Tree " + (t + 1) + ": illegal.");
                    return null;
                }

                // Per-tree tallies used for the exact-match metrics.
                int treeHeadHits = 0, treeHeadHitsNoPunc = 0, treeNodesNoPunc = 0;

                // Nodes are numbered from 1; token k - 1 belongs to node k.
                for (int node = 1; node <= predicted.n; node++)
                {
                    bool sameHead = predicted.GetHead(node) == gold.GetHead(node);
                    // Labels are only compared when the heads already agree.
                    bool sameArc = sameHead && predicted.GetLabel(node).Equals(gold.GetLabel(node));

                    totalArcs++;
                    if (sameHead)
                    {
                        headHits++;
                        treeHeadHits++;
                    }
                    if (sameArc)
                    {
                        arcHits++;
                    }

                    string tag = words[node - 1].Tag();
                    if (punctTags.Contains(tag))
                    {
                        continue;
                    }

                    totalArcsNoPunc++;
                    treeNodesNoPunc++;
                    if (sameHead)
                    {
                        headHitsNoPunc++;
                        treeHeadHitsNoPunc++;
                    }
                    if (sameArc)
                    {
                        arcHitsNoPunc++;
                    }
                }

                if (treeHeadHits == predicted.n)
                {
                    exactTrees++;
                }
                if (treeHeadHitsNoPunc == treeNodesNoPunc)
                {
                    exactTreesNoPunc++;
                }
                if (predicted.GetRoot() == gold.GetRoot())
                {
                    rootHits++;
                }
            }

            metrics["UAS"]       = headHits * 100.0 / totalArcs;
            metrics["UASnoPunc"] = headHitsNoPunc * 100.0 / totalArcsNoPunc;
            metrics["LAS"]       = arcHits * 100.0 / totalArcs;
            metrics["LASnoPunc"] = arcHitsNoPunc * 100.0 / totalArcsNoPunc;
            metrics["UEM"]       = exactTrees * 100.0 / treeCount;
            metrics["UEMnoPunc"] = exactTreesNoPunc * 100.0 / treeCount;
            metrics["ROOT"]      = rootHits * 100.0 / treeCount;
            return metrics;
        }
Beispiel #22
0
        /// <summary>
        /// Command-line entry point: configures a <c>JointParsingModel</c> from the arguments and runs it
        /// on a training treebank, an optional test treebank and an input stream.
        /// </summary>
        /// <remarks>
        /// Recognised options: <c>v</c> (verbose), <c>t</c> (test treebank path), <c>l</c> (maximum gold
        /// sentence length for evaluation) and <c>o</c> (input is a gzipped serialized object stream).
        /// The remaining positional arguments must number exactly <c>MinArgs</c>; the first is the training
        /// treebank path. Input is read from standard input unless the <c>eclipse</c> runtime property is
        /// set, in which case <c>debug.2.xml</c> is read instead. The process exits with code -1 on bad
        /// usage or when the input stream cannot be opened.
        /// </remarks>
        /// <param name="args">Command-line arguments as described above</param>
        public static void Main(string[] args)
        {
            if (args.Length < MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options        = StringUtils.ArgsToProperties(args, OptionArgDefs());
            bool       Verbose        = PropertiesUtils.GetBool(options, "v", false);
            File       testTreebank   = options.Contains("t") ? new File(options.GetProperty("t")) : null;
            int        maxGoldSentLen = PropertiesUtils.GetInt(options, "l", int.MaxValue);
            bool       SerInput       = PropertiesUtils.GetBool(options, "o", false);

            // Positional (non-option) arguments are stored under the empty-string key.
            string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (parsedArgs.Length != MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            File     trainTreebank = new File(parsedArgs[0]);
            // A default-constructed DateTime is DateTime.MinValue, not the current time, so take the
            // clock reading explicitly; otherwise the elapsed time below is always zero.
            DateTime startTime     = DateTime.Now;

            log.Info("###################################");
            log.Info("### Joint Segmentation / Parser ###");
            log.Info("###################################");
            System.Console.Error.Printf("Start time: %s\n", startTime);
            JointParsingModel parsingModel = new JointParsingModel();

            parsingModel.SetVerbose(Verbose);
            parsingModel.SetMaxEvalSentLen(maxGoldSentLen);
            parsingModel.SetSerInput(SerInput);
            //WSGDEBUG -- Some stuff for eclipse debugging
            InputStream inputStream = null;

            try
            {
                if (Runtime.GetProperty("eclipse") == null)
                {
                    inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(Runtime.@in)) : Runtime.@in;
                }
                else
                {
                    FileInputStream fileStream = new FileInputStream(new File("debug.2.xml"));
                    inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(fileStream)) : fileStream;
                }
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                System.Environment.Exit(-1);
            }
            try
            {
                if (!trainTreebank.Exists())
                {
                    log.Info("Training treebank does not exist!\n  " + trainTreebank.GetPath());
                }
                else if (testTreebank != null && !testTreebank.Exists())
                {
                    log.Info("Test treebank does not exist!\n  " + testTreebank.GetPath());
                }
                else if (parsingModel.Run(trainTreebank, testTreebank, inputStream))
                {
                    log.Info("Successful shutdown!");
                }
                else
                {
                    log.Error("Parsing model failure.");
                }
            }
            finally
            {
                // Close the stream only after the model has consumed it. Closing it right after
                // opening (as before) handed an already-closed stream to Run().
                if (inputStream != null)
                {
                    try
                    {
                        inputStream.Close();
                    }
                    catch (IOException)
                    {
                        // Best-effort close at shutdown; nothing useful can be done on failure.
                    }
                }
            }
            DateTime stopTime    = DateTime.Now;
            long     elapsedTime = stopTime.GetTime() - startTime.GetTime();

            log.Info();
            log.Info();
            System.Console.Error.Printf("Completed processing at %s\n", stopTime);
            System.Console.Error.Printf("Elapsed time: %d seconds\n", (int)(elapsedTime / 1000F));
        }
Beispiel #23
0
        /// <summary>
        /// Builds the unknown-word signature (equivalence class) for a Spanish word.
        /// TODO Can add various signatures, setting the signature via Options.
        /// </summary>
        /// <remarks>
        /// Only unknown level 1 is implemented; any other level logs an error and yields the bare
        /// "UNK" label. At level 1, numeric words get "UNK#" and punctuation gets "UNK!"; all other
        /// words get "UNK" plus suffix markers and a trailing "-C" or "-c" capitalization marker.
        /// </remarks>
        /// <param name="word">The word to make a signature for</param>
        /// <param name="loc">
        /// Its position in the sentence (mainly so sentence-initial
        /// capitalized words can be treated differently); not consulted by this implementation
        /// </param>
        /// <returns>A String that is its signature (equivalence class)</returns>
        public override string GetSignature(string word, int loc)
        {
            string        unkLabel  = "UNK";
            StringBuilder signature = new StringBuilder(unkLabel);

            if (unknownLevel != 1)
            {
                log.Error(string.Format("%s: Invalid unknown word signature! (%d)%n", this.GetType().FullName, unknownLevel));
                return signature.ToString();
            }

            // Numbers and punctuation get a one-character marker and nothing else.
            if (StringUtils.IsNumeric(word))
            {
                return signature.Append('#').ToString();
            }
            if (StringUtils.IsPunct(word))
            {
                return signature.Append('!').ToString();
            }

            // Mutually exclusive patterns
            signature.Append(SpanishUnknownWordSignatures.ConditionalSuffix(word))
                     .Append(SpanishUnknownWordSignatures.ImperfectSuffix(word))
                     .Append(SpanishUnknownWordSignatures.InfinitiveSuffix(word))
                     .Append(SpanishUnknownWordSignatures.AdverbSuffix(word));

            // Broad coverage patterns -- only apply if we haven't yet matched at all.
            // The builder is only ever appended to, so an unchanged length means it still holds just the label.
            if (signature.Length == unkLabel.Length)
            {
                if (SpanishUnknownWordSignatures.HasVerbFirstPersonPluralSuffix(word))
                {
                    signature.Append("-vb1p");
                }
                else if (SpanishUnknownWordSignatures.HasGerundSuffix(word))
                {
                    signature.Append("-ger");
                }
                else if (word.EndsWith("s"))
                {
                    signature.Append("-s");
                }
            }

            // Backoff to suffix if we haven't matched anything else
            if (unknownSuffixSize > 0 && signature.Length == unkLabel.Length)
            {
                int suffixLength = unknownSuffixSize <= word.Length ? unknownSuffixSize : word.Length;
                signature.Append('-').Append(Sharpen.Runtime.Substring(word, word.Length - suffixLength));
            }

            // "-C" marks a word with an upper/title-case initial that is not entirely upper case.
            char initial        = word[0];
            bool initialCapOnly = (char.IsUpperCase(initial) || char.IsTitleCase(initial)) && !IsUpperCase(word);
            signature.Append(initialCapOnly ? "-C" : "-c");

            return signature.ToString();
        }
        /// <summary>
        /// Prints the per-category labeled bracketing statistics (precision, recall, F1 and the
        /// guessed/gold counts) followed by the totals over all evaluated categories.
        /// </summary>
        /// <remarks>
        /// Categories are printed in ascending order of F1. Categories whose F1 is NaN are ranked
        /// as -1, and a category whose F1 collides with an already-ranked one has its sort key nudged
        /// by a small random amount. Nothing is printed if the precision and recall key sets differ in size.
        /// </remarks>
        /// <param name="verbose">Not consulted by this implementation</param>
        /// <param name="pw">Writer that receives the report</param>
        public override void Display(bool verbose, PrintWriter pw)
        {
            if (precisions.KeySet().Count != recalls.KeySet().Count)
            {
                log.Error("Different counts for precisions and recalls!");
                return;
            }

            ICollection <ILabel> labels = GetEvalLabelSet(precisions.KeySet());
            Random tieBreaker = new Random();
            // Sorted by F1 so that the report below comes out ordered.
            IDictionary <double, ILabel> labelsByF1 = new SortedDictionary <double, ILabel>();

            foreach (ILabel label in labels)
            {
                double guessed   = pnums2.GetCount(label);
                double gold      = rnums2.GetCount(label);
                double precision = precisions2.GetCount(label) / guessed;
                double recall    = recalls2.GetCount(label) / gold;
                double score     = 2.0 / (1.0 / precision + 1.0 / recall);
                if (double.IsNaN(score))
                {
                    score = -1.0;
                }
                // A key already in use would overwrite the earlier category, so perturb it slightly.
                double sortKey = labelsByF1.Contains(score) ? score + (tieBreaker.NextDouble() / 1000.0) : score;
                labelsByF1[sortKey] = label;
            }

            pw.Println("============================================================");
            pw.Println("Labeled Bracketed Evaluation by Category -- final statistics");
            pw.Println("============================================================");

            // Per category; the sums feed the totals line at the end.
            double sumPrecision = 0.0;
            double sumGuessed   = 0.0;
            double sumRecall    = 0.0;
            double sumGold      = 0.0;

            foreach (ILabel label in labelsByF1.Values)
            {
                double guessed      = pnums2.GetCount(label);
                double gold         = rnums2.GetCount(label);
                double precisionPct = precisions2.GetCount(label) / guessed * 100.0;
                double recallPct    = recalls2.GetCount(label) / gold * 100.0;
                double f1Pct        = 2.0 / (1.0 / precisionPct + 1.0 / recallPct);

                sumPrecision += precisions2.GetCount(label);
                sumGuessed   += guessed;
                sumRecall    += recalls2.GetCount(label);
                sumGold      += gold;

                // A zero denominator makes the corresponding figure meaningless; show "N/A" instead.
                bool   noGuessed     = guessed == 0.0;
                bool   noGold        = gold == 0.0;
                string precisionText = noGuessed ? "N/A" : string.Format("%.2f", precisionPct);
                string recallText    = noGold ? "N/A" : string.Format("%.2f", recallPct);
                string f1Text        = (noGuessed || noGold) ? "N/A" : string.Format("%.2f", f1Pct);
                pw.Printf("%s\tLP: %s\tguessed: %d\tLR: %s\tgold: %d\t F1: %s%n", label.Value(), precisionText, (int)guessed, recallText, (int)gold, f1Text);
            }

            pw.Println("============================================================");

            // Totals
            double totalPrecision = sumPrecision / sumGuessed;
            double totalRecall    = sumRecall / sumGold;
            double totalF1        = (2 * totalPrecision * totalRecall) / (totalPrecision + totalRecall);

            pw.Printf("Total\tLP: %.2f\tguessed: %d\tLR: %.2f\tgold: %d\t F1: %.2f%n", totalPrecision * 100.0, (int)sumGuessed, totalRecall * 100.0, (int)sumGold, totalF1 * 100.0);
            pw.Println("============================================================");
        }