public HybridCorefAnnotator(Properties props)
{
    // for backward compatibility
    try
    {
        // Load the default properties
        Properties corefProps = new Properties();
        try
        {
            using (BufferedReader reader = IOUtils.ReaderFromString("edu/stanford/nlp/hcoref/properties/coref-default-dep.properties"))
            {
                corefProps.Load(reader);
            }
        }
        catch (IOException)
        {
        }
        // Add passed properties
        IEnumeration<object> keys = props.Keys;
        while (keys.MoveNext())
        {
            string key = keys.Current.ToString();
            corefProps.SetProperty(key, props.GetProperty(key));
        }
        // Create coref system
        corefSystem = new HybridCorefSystem(corefProps);
        OldFormat = bool.Parse(props.GetProperty("oldCorefFormat", "false"));
    }
    catch (Exception e)
    {
        log.Error("cannot create HybridCorefAnnotator!");
        Sharpen.Runtime.PrintStackTrace(e);
        throw new Exception(e.Message, e);
    }
}
public CorefAnnotator(Properties props)
{
    this.props = props;
    try
    {
        // if the user tries to run with coref.language = ENGLISH and coref.algorithm = hybrid, throw an Exception;
        // we do not support those settings at this time
        if (CorefProperties.Algorithm(props).Equals(CorefProperties.CorefAlgorithmType.Hybrid) && CorefProperties.GetLanguage(props).Equals(Locale.English))
        {
            log.Error("Error: coref.algorithm=hybrid is not supported for English, " + "please change coref.algorithm or coref.language");
            throw new Exception("coref.algorithm=hybrid is not supported for English");
        }
        // suppress the CoNLL loading message while the coref system is constructed
        props.SetProperty("coref.printConLLLoadingMessage", "false");
        corefSystem = new CorefSystem(props);
        props.Remove("coref.printConLLLoadingMessage");
    }
    catch (Exception e)
    {
        log.Error("Error creating CorefAnnotator...terminating pipeline construction!");
        log.Error(e);
        throw new Exception(e.Message, e);
    }
    // unless custom mention detection is set, just use the default coref mention detector
    performMentionDetection = !PropertiesUtils.GetBool(props, "coref.useCustomMentionDetection", false);
    if (performMentionDetection)
    {
        mentionAnnotator = new CorefMentionAnnotator(props);
    }
}
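// Usage sketch (hypothetical driver, not part of the original file): constructing the annotator
// from a Properties object. This assumes the Sharpen-style Properties port and the standard
// CoreNLP coref property keys; "hybrid" plus English would throw, per the constructor above.
public static class CorefAnnotatorUsageDemo
{
    public static CorefAnnotator Build()
    {
        Properties props = new Properties();
        props.SetProperty("coref.algorithm", "neural");  // "hybrid" is rejected for English
        props.SetProperty("coref.language", "en");
        return new CorefAnnotator(props);  // also constructs the default CorefMentionAnnotator
    }
}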
public EntityMentionsAnnotator(string name, Properties props)
{
    // note: used in annotate.properties
    // if the user has supplied custom CoreAnnotations for the NER tags and entity mentions, override the default keys
    try
    {
        if (props.Contains(name + ".nerCoreAnnotation"))
        {
            nerCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".nerCoreAnnotation"));
        }
        if (props.Contains(name + ".nerNormalizedCoreAnnotation"))
        {
            nerNormalizedCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".nerNormalizedCoreAnnotation"));
        }
        if (props.Contains(name + ".mentionsCoreAnnotation"))
        {
            mentionsCoreAnnotationClass = (Type)Sharpen.Runtime.GetType(props.GetProperty(name + ".mentionsCoreAnnotation"));
        }
    }
    catch (TypeLoadException e)
    {
        log.Error(e.Message);
    }
    chunkIdentifier = new LabeledChunkIdentifier();
    doAcronyms = bool.Parse(props.GetProperty(name + ".acronyms", props.GetProperty("acronyms", "false")));
    // set up language info; this is needed for creating pronominal mentions
    entityMentionsLanguage = LanguageInfo.GetLanguageFromString(props.GetProperty(name + ".language", "en"));
}
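// The per-annotator lookups above resolve a class from a property value via reflection.
// A minimal .NET sketch of that pattern with Type.GetType; the method name and fallback
// parameter are illustrative, not the port's API:
public static class AnnotationTypeResolverDemo
{
    public static System.Type Resolve(string configuredName, System.Type fallback)
    {
        // Type.GetType returns null when the (assembly-qualified) name cannot be loaded
        System.Type t = configuredName == null ? null : System.Type.GetType(configuredName);
        return t ?? fallback;
    }
}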
private double InterpretCmdOutput()
{
    string output = GetOutput();
    string[] parts = output.Split("\\s+");
    int fScoreIndex = 0;
    for (; fScoreIndex < parts.Length; fScoreIndex++)
    {
        if (parts[fScoreIndex].Equals("FB1:"))
        {
            break;
        }
    }
    fScoreIndex += 1;
    if (fScoreIndex < parts.Length)
    {
        return(double.Parse(parts[fScoreIndex]));
    }
    else
    {
        log.Error("in CRFClassifierEvaluator.interpretCmdOutput(), cannot find FB1 score in output:\n" + output);
        return(-1);
    }
}
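// Standalone sketch of the FB1-extraction logic above, runnable against a fabricated
// conlleval-style summary line. Regex.Split stands in for the Sharpen regex-based
// string.Split extension used in the method.
using System;
using System.Globalization;
using System.Text.RegularExpressions;

public static class FB1ParseDemo
{
    public static double ParseFB1(string output)
    {
        string[] parts = Regex.Split(output, "\\s+");
        for (int i = 0; i + 1 < parts.Length; i++)
        {
            if (parts[i] == "FB1:")
            {
                return double.Parse(parts[i + 1], CultureInfo.InvariantCulture);
            }
        }
        return -1;  // mirror the method's failure value
    }

    public static void Main()
    {
        string sample = "accuracy: 97.05%; precision: 90.11%; recall: 88.92%; FB1: 89.51";
        Console.WriteLine(ParseFB1(sample));  // 89.51
    }
}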
private ICoreMap DoOneSentence(ICoreMap sentence)
{
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<TaggedWord> tagged = null;
    if (tokens.Count <= maxSentenceLength)
    {
        try
        {
            tagged = pos.TagSentence(tokens, this.reuseTags);
        }
        catch (OutOfMemoryException e)
        {
            log.Error(e);
            // Beware that we can now get an OOM in logging, too.
            log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
        }
    }
    if (tagged != null)
    {
        int sz = tokens.Count;
        for (int i = 0; i < sz; i++)
        {
            tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
        }
    }
    else
    {
        foreach (CoreLabel token in tokens)
        {
            token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
        }
    }
    return(sentence);
}
/// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
/// <remarks>
/// A fast, rule-based tokenizer for Modern Standard French.
/// Performs punctuation splitting and light tokenization by default.
/// <p>
/// Currently, this tokenizer does not do line splitting. It assumes that the input
/// file is delimited by the system line separator. The output will be equivalently
/// delimited.
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Lexer options
    ITokenizerFactory<CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
    string orthoOptions = options.GetProperty("options", string.Empty);
    // When called from this main method, split on newline. No options for
    // more granular sentence splitting.
    orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
    tf.SetOptions(orthoOptions);
    // Other options
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false);
    // Read the file from stdin
    int nLines = 0;
    int nTokens = 0;
    long startTime = Runtime.NanoTime();
    try
    {
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(FrenchLexer.NewlineToken))
            {
                ++nLines;
                printSpace = false;
                System.Console.Out.WriteLine();
            }
            else
            {
                if (printSpace)
                {
                    System.Console.Out.Write(" ");
                }
                string outputToken = toLower ? word.ToLower(Locale.French) : word;
                System.Console.Out.Write(outputToken);
                printSpace = true;
            }
        }
    }
    catch (UnsupportedEncodingException e)
    {
        log.Error(e);
    }
    long elapsedTime = Runtime.NanoTime() - startTime;
    double linesPerSec = (double)nLines / (elapsedTime / 1e9);
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
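// The throughput arithmetic above converts elapsed nanoseconds to seconds (1e9 ns per second).
// A .NET-native sketch of the same measurement using Stopwatch instead of the Sharpen
// Runtime.NanoTime shim:
using System;
using System.Diagnostics;

public static class ThroughputDemo
{
    public static void Main()
    {
        Stopwatch timer = Stopwatch.StartNew();
        int nLines = 0;
        string line;
        while ((line = Console.In.ReadLine()) != null)
        {
            nLines++;
        }
        timer.Stop();
        double linesPerSec = nLines / timer.Elapsed.TotalSeconds;
        Console.Error.Write(string.Format("Done! Read {0} lines at {1:F2} lines/sec\n", nLines, linesPerSec));
    }
}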
// todo [2017]: This should be redone sometime to not have such a hardcoded upper limit.
private void SerializeDictionary(string serializePath)
{
    logger.Info("Serializing dictionaries to " + serializePath + " ... ");
    try
    {
        ObjectOutputStream oos = IOUtils.WriteStreamFromString(serializePath);
        //oos.writeObject(MAX_LEXICON_LENGTH);
        oos.WriteObject(words_);
        //oos.writeObject(cdtos_);
        oos.Close();
        logger.Info("done.");
    }
    catch (Exception e)
    {
        logger.Error("Failed", e);
        throw new RuntimeIOException(e);
    }
}
private IList<Tree> DoOneSentence(IList<ParserConstraint> constraints, IList<CoreLabel> words)
{
    IParserQuery pq = parser.ParserQuery();
    pq.SetConstraints(constraints);
    pq.Parse(words);
    IList<Tree> trees = Generics.NewLinkedList();
    try
    {
        // Use bestParse if kBest is set to 1.
        if (this.kBest == 1)
        {
            Tree t = pq.GetBestParse();
            if (t == null)
            {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            else
            {
                double score = pq.GetBestScore();
                t.SetScore(score % -10000.0);
                trees.Add(t);
            }
        }
        else
        {
            IList<ScoredObject<Tree>> scoredObjects = pq.GetKBestParses(this.kBest);
            if (scoredObjects == null || scoredObjects.Count < 1)
            {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            else
            {
                foreach (ScoredObject<Tree> so in scoredObjects)
                {
                    // -10000 denotes unknown words
                    Tree tree = so.Object();
                    tree.SetScore(so.Score() % -10000.0);
                    trees.Add(tree);
                }
            }
        }
    }
    catch (OutOfMemoryException e)
    {
        log.Error(e);
        // Beware that we can now get an OOM in logging, too.
        log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + "). " + "Will ignore and try to continue.");
    }
    catch (NoSuchParseException)
    {
        log.Warn("Parsing of sentence failed, possibly because of out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
    }
    return(trees);
}
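// Why "score % -10000.0": per the source comment above, -10000 denotes unknown words, i.e.
// the parser adds a -10000 penalty to the log-score for each unknown word. Taking the
// remainder modulo -10000 strips whole multiples of the penalty and recovers the underlying
// score. A tiny arithmetic sketch (values fabricated):
using System;

public static class ScoreModuloDemo
{
    public static void Main()
    {
        double rawScore = -20123.45;             // two unknown-word penalties on top of -123.45
        Console.WriteLine(rawScore % -10000.0);  // approximately -123.45
    }
}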
/// <summary>Start the actual server.</summary>
/// <param name="command">the command we are using to start the server.</param>
/// <returns>True if the server was started; false otherwise.</returns>
private bool StartServer(string[] command)
{
    ProcessBuilder proc = new ProcessBuilder(command);
    try
    {
        lock (this)
        {
            this.server = Optional.Of(new WebServiceAnnotator.RunningProcess(this, proc.Start()));
        }
        log.Info("Started server " + StringUtils.Join(command));
        return(true);
    }
    catch (IOException)
    {
        log.Error("Could not start process: " + StringUtils.Join(command));
        return(false);
    }
}
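// ProcessBuilder above comes from the Sharpen runtime shim for the Java API. The same pattern
// with the native .NET process API, as a hedged sketch (file name and arguments are illustrative):
using System;
using System.Diagnostics;

public static class ProcessStartDemo
{
    public static bool TryStart(string fileName, string arguments)
    {
        try
        {
            ProcessStartInfo psi = new ProcessStartInfo(fileName, arguments) { UseShellExecute = false };
            Process proc = Process.Start(psi);  // throws, or returns null, if the binary cannot be launched
            return proc != null;
        }
        catch (Exception e)
        {
            Console.Error.WriteLine("Could not start process: " + e.Message);
            return false;
        }
    }
}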
/// <summary>Just for testing.</summary>
public static void Main(string[] args)
{
    try
    {
        ITreeReader tr = new PennTreeReader(new StringReader("(S (NP (NNP Sam)) (VP (VBD died) (NP (NN today))))"), new LabeledScoredTreeFactory());
        Tree t = tr.ReadTree();
        System.Console.Out.WriteLine(t);
        TreeGraphNode tgn = new TreeGraphNode(t, (TreeGraphNode)null);
        System.Console.Out.WriteLine(tgn.ToPrettyString(0));
        EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(tgn);
        System.Console.Out.WriteLine(tgn.ToPrettyString(0));
        tgn.PercolateHeads(new SemanticHeadFinder());
        System.Console.Out.WriteLine(tgn.ToPrettyString(0));
    }
    catch (Exception e)
    {
        log.Error("Horrible error: " + e);
        log.Error(e);
    }
}
public static void Load(string filename) { try { operations = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.GetOperationsFromFile(filename, "UTF-8", new TregexPatternCompiler()); } catch (IOException) { log.Error(string.Format("%s: Warning - could not load Tsurgeon file from %s.%n", typeof(Edu.Stanford.Nlp.Trees.UniversalPOSMapper).GetSimpleName(), filename)); } loaded = true; }
/// <summary>Read lexicon from a one-column text file.</summary>
private void AddLexicon(string filename)
{
    try
    {
        BufferedReader lexiconReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        string lexiconLine;
        while ((lexiconLine = lexiconReader.ReadLine()) != null)
        {
            AddStringToLexicon(lexiconLine);
        }
    }
    catch (FileNotFoundException)
    {
        logger.Error("Lexicon not found: " + filename);
        System.Environment.Exit(-1);
    }
    catch (IOException e)
    {
        logger.Error("IO error while reading: " + filename, e);
        throw new Exception(e.Message, e);
    }
}
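// The same one-column read in idiomatic .NET, as a sketch: StreamReader replaces the
// BufferedReader/InputStreamReader/FileInputStream chain, and a HashSet stands in for the
// lexicon store (the class and set here are hypothetical stand-ins):
using System.Collections.Generic;
using System.IO;
using System.Text;

public static class LexiconLoadDemo
{
    public static HashSet<string> LoadLexicon(string filename)
    {
        HashSet<string> lexicon = new HashSet<string>();
        using (StreamReader reader = new StreamReader(filename, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                lexicon.Add(line);  // one lexicon entry per line
            }
        }
        return lexicon;
    }
}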
public DeterministicCorefAnnotator(Properties props)
{
    // for backward compatibility
    try
    {
        corefSystem = new SieveCoreferenceSystem(props);
        mentionExtractor = new MentionExtractor(corefSystem.Dictionaries(), corefSystem.Semantics());
        OldFormat = bool.Parse(props.GetProperty("oldCorefFormat", "false"));
        allowReparsing = PropertiesUtils.GetBool(props, Constants.AllowReparsingProp, Constants.AllowReparsing);
        // unless custom mention detection is set, just use the default coref mention detector
        performMentionDetection = !PropertiesUtils.GetBool(props, "dcoref.useCustomMentionDetection", false);
        if (performMentionDetection)
        {
            mentionAnnotator = new CorefMentionAnnotator(props);
        }
    }
    catch (Exception e)
    {
        log.Error("cannot create DeterministicCorefAnnotator!");
        log.Error(e);
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// The main() method tokenizes a file in the specified Encoding
/// and prints it to standard output in the specified Encoding.
/// </summary>
/// <remarks>
/// The main() method tokenizes a file in the specified Encoding
/// and prints it to standard output in the specified Encoding.
/// Its arguments are (Infile, Encoding).
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        log.Error("Usage: CHTBTokenizer inputFile encoding");
        return;
    }
    string encoding = args[1];
    Reader @in = IOUtils.ReaderFromString(args[0], encoding);
    for (ITokenizer<string> st = new Edu.Stanford.Nlp.Trees.International.Pennchinese.CHTBTokenizer(@in); st.MoveNext();)
    {
        string s = st.Current;
        EncodingPrintWriter.Out.Println(s, encoding);
    }
}
/// <summary>Concatenate entity annotations to a String.</summary>
/// <remarks>
/// Concatenate entity annotations to a String. Note that Chinese does not use space to separate
/// tokens so we will follow this convention here.
/// </remarks>
/// <param name="l"/>
/// <typeparam name="E"/>
/// <returns/>
private static string SingleEntityToString<E>(IList<E> l) where E : ICoreMap
{
    string entityType = l[0].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
    StringBuilder sb = new StringBuilder();
    foreach (E w in l)
    {
        if (!w.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Equals(entityType))
        {
            log.Error("differing NER tags detected in entity: " + l);
            throw new Exception("Error with entity construction, two tokens had inconsistent NER tags");
        }
        sb.Append(w.Get(typeof(CoreAnnotations.TextAnnotation)));
    }
    return(sb.ToString());
}
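// A self-contained sketch of the same join-without-spaces logic over plain (text, nerTag)
// pairs instead of ICoreMap tokens; the tuple shape and the sample tokens are fabricated:
using System;
using System.Collections.Generic;
using System.Text;

public static class EntityJoinDemo
{
    public static string JoinEntity(IList<(string Text, string Ner)> tokens)
    {
        string entityType = tokens[0].Ner;
        StringBuilder sb = new StringBuilder();
        foreach ((string Text, string Ner) t in tokens)
        {
            if (t.Ner != entityType)
            {
                throw new Exception("Error with entity construction, two tokens had inconsistent NER tags");
            }
            sb.Append(t.Text);  // no separator: Chinese tokens are not space-delimited
        }
        return sb.ToString();
    }

    public static void Main()
    {
        var entity = new List<(string, string)> { ("斯坦", "ORGANIZATION"), ("福", "ORGANIZATION") };
        Console.WriteLine(JoinEntity(entity));  // 斯坦福
    }
}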
public virtual double ValueAt(double[] x) { if (++i % outputFreq == 0) { log.Info("Storing interim (double) weights to " + filename + " ... "); try { DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)))); ConvertByteArray.SaveDoubleArr(dos, x); dos.Close(); } catch (IOException) { log.Error("!"); return(1); } log.Info("DONE."); } return(0); }
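// Writing an interim double[] checkpoint with the native .NET gzip stack, mirroring the
// FileOutputStream/GZIPOutputStream/DataOutputStream chain above. One caveat: Java's
// DataOutputStream writes big-endian doubles while BinaryWriter is little-endian, so the two
// on-disk formats are not interchangeable. The file name is illustrative.
using System.IO;
using System.IO.Compression;

public static class WeightCheckpointDemo
{
    public static void SaveDoubles(string filename, double[] x)
    {
        using (FileStream fs = File.Create(filename))
        using (GZipStream gz = new GZipStream(fs, CompressionLevel.Optimal))
        using (BinaryWriter w = new BinaryWriter(gz))
        {
            w.Write(x.Length);  // length header, then the raw values
            foreach (double v in x)
            {
                w.Write(v);
            }
        }
    }
}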
//public static SsurgeonPattern fromXML(String xmlString) throws Exception {
//SAXBuilder builder = new SAXBuilder();
//Document jdomDoc = builder.build(xmlString);
//jdomDoc.getRootElement().getChildren(SsurgeonPattern.SSURGEON_ELEM_TAG);
//}
/// <summary>Given a target filepath and a list of Ssurgeon patterns, writes them out as XML forms.</summary>
public static void WriteToFile(File tgtFile, IList<SsurgeonPattern> patterns)
{
    try
    {
        IDocument domDoc = CreatePatternXMLDoc(patterns);
        if (domDoc != null)
        {
            Transformer tformer = TransformerFactory.NewInstance().NewTransformer();
            tformer.SetOutputProperty(OutputKeys.Indent, "yes");
            tformer.Transform(new DOMSource(domDoc), new StreamResult(tgtFile));
        }
        else
        {
            log.Warning("Was not able to create XML document for pattern list, file not written.");
        }
    }
    catch (Exception e)
    {
        log.Error(typeof(Edu.Stanford.Nlp.Semgraph.Semgrex.Ssurgeon.Ssurgeon).FullName, "writeToFile");
        log.Error(e);
    }
}
/// <param name="args"/>
public static void Main(string[] args)
{
    bool unicodeToBuck = false;
    bool outputUnicodeValues = false;
    File inputFile = null;
    foreach (string arg in args)
    {
        if (arg.StartsWith("-"))
        {
            switch (arg)
            {
                case "-u2b":
                {
                    unicodeToBuck = true;
                    break;
                }
                case "-o":
                {
                    outputUnicodeValues = true;
                    break;
                }
                case "-d":
                {
                    Debug = true;
                    break;
                }
                default:
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    return;
                }
            }
        }
        else
        {
            inputFile = new File(arg);
            break;
        }
    }
    Edu.Stanford.Nlp.International.Arabic.Buckwalter b = new Edu.Stanford.Nlp.International.Arabic.Buckwalter(unicodeToBuck);
    b.outputUnicodeValues = outputUnicodeValues;
    int j = (b.outputUnicodeValues ? 2 : int.MaxValue);
    if (j < args.Length)
    {
        for (; j < args.Length; j++)
        {
            EncodingPrintWriter.Out.Println(args[j] + " -> " + b.Apply(args[j]), "utf-8");
        }
    }
    else
    {
        int numLines = 0;
        try
        {
            BufferedReader br = (inputFile == null) ? new BufferedReader(new InputStreamReader(Runtime.@in, "utf-8")) : new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), "utf-8"));
            System.Console.Error.Printf("Reading input...");
            string line;
            while ((line = br.ReadLine()) != null)
            {
                EncodingPrintWriter.Out.Println(b.Apply(line), "utf-8");
                numLines++;
            }
            br.Close();
            System.Console.Error.Printf("done.\nConverted %d lines from %s.\n", numLines, (unicodeToBuck ? "UTF-8 to Buckwalter" : "Buckwalter to UTF-8"));
        }
        catch (UnsupportedEncodingException)
        {
            log.Error("File system does not support UTF-8 encoding.");
        }
        catch (FileNotFoundException)
        {
            log.Error("File does not exist: " + inputFile.GetPath());
        }
        catch (IOException)
        {
            System.Console.Error.Printf("ERROR: IO exception while reading file (line %d).\n", numLines);
        }
    }
    if (Debug)
    {
        if (!b.unmappable.KeySet().IsEmpty())
        {
            EncodingPrintWriter.Err.Println("Characters that could not be converted [passed through!]:", "utf-8");
            EncodingPrintWriter.Err.Println(b.unmappable.ToString(), "utf-8");
        }
        else
        {
            EncodingPrintWriter.Err.Println("All characters successfully converted!", "utf-8");
        }
    }
}
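// A toy sketch of table-driven transliteration in the Buckwalter spirit: a dictionary maps
// each source character to its target form, and unmappable characters pass through unchanged
// (as the Debug report above implies). The three mappings shown are the standard Buckwalter
// alef/beh/teh pairs; the rest of the table is omitted.
using System;
using System.Collections.Generic;
using System.Text;

public static class TranslitDemo
{
    private static readonly Dictionary<char, char> Buck2Uni = new Dictionary<char, char>
    {
        { 'A', '\u0627' },  // alef
        { 'b', '\u0628' },  // beh
        { 't', '\u062A' }   // teh
    };

    public static string Apply(string input)
    {
        StringBuilder sb = new StringBuilder();
        foreach (char c in input)
        {
            sb.Append(Buck2Uni.TryGetValue(c, out char mapped) ? mapped : c);  // pass unmappable through
        }
        return sb.ToString();
    }

    public static void Main()
    {
        Console.WriteLine(Apply("Ab"));  // prints the Arabic letters alef + beh
    }
}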
/// <exception cref="System.Exception"/>
public virtual Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> Call()
{
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
    try
    {
        ICollection<CandidatePhrase> alreadyLabeledPhrases = new HashSet<CandidatePhrase>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
        CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
        foreach (string sentid in sentids)
        {
            IList<CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair<TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                // Higher branch values make the matching faster but use more memory
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    // find if the neighboring words are labeled - if so, club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    // to make sure we discard phrases with stopwords in between, but include the ones in which
                    // stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed as initialized false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // discard the phrase if a token in the middle was skipped (i.e., an interior stop word)
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        logger.Error(e);
        throw;
    }
}
private void RunSegmentation(ICoreMap annotation)
{
    //0 2
    // A BC D E
    // 1 10 1 1
    // 0 12 3 4
    // 0, 0+1 ,
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    // the original text String
    IList<CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));
    // the way it was divided by splitCharacters
    if (Verbose)
    {
        log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
    }
    IList<CoreLabel> tokens = new List<CoreLabel>();
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    // Run the segmenter! On the whole String. It knows not about the splitting into chars.
    // Can we change this to have it run directly on the already existing list of tokens? That would help, no?
    IList<string> words;
    if (!tokenizeNewline)
    {
        text = text.ReplaceAll("[\r\n]", string.Empty);
        words = segmenter.SegmentString(text);
    }
    else
    {
        // remove leading and trailing newlines
        text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
        text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
        // if using the sentence-split-on-two-newlines option, replace single newlines;
        // single newlines should be ignored for segmenting
        if (sentenceSplitOnTwoNewlines)
        {
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
            // do a second pass to handle the corner case of consecutive isolated newlines
            // x \n x \n x
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
        }
        // Run the segmenter on each line so that we don't get tokens that cross line boundaries
        // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
        string[] lines = text.Split(string.Format("((?<=%1$s)|(?=%1$s))", separator));
        words = new List<string>();
        foreach (string line in lines)
        {
            if (separatorPattern.Matcher(line).Matches())
            {
                // Don't segment newline tokens, keep them as-is
                words.Add(line);
            }
            else
            {
                Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
            }
        }
    }
    if (Verbose)
    {
        log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
    }
    // Go through everything again and make the final tokens list; for loop is over segmented words
    int pos = 0;
    // This is used to index sentChars, the output from splitCharacters
    StringBuilder xmlBuffer = new StringBuilder();
    int xmlBegin = -1;
    foreach (string w in words)
    {
        CoreLabel fl = sentChars[pos];
        string xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
        if (Verbose)
        {
            log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
        }
        if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
        {
            // Beginnings of plain text and other XML tags are good places to end an XML tag
            if (xmlBuffer.Length > 0)
            {
                // Form the XML token
                string xmlTag = xmlBuffer.ToString();
                CoreLabel fl1 = sentChars[pos - 1];
                int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                // Clean up and prepare for the next XML tag
                xmlBegin = -1;
                xmlBuffer = new StringBuilder();
            }
        }
        if (!"0".Equals(xmlCharAnnotation))
        {
            // found an XML character; fl changes inside this loop!
            while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
            {
                // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                // and we're in sync with segmenter output again
                xmlBuffer.Append(' ');
                pos += 1;
                fl = sentChars[pos];
            }
            xmlBuffer.Append(w);
            pos = AdvancePos(sentChars, pos, w);
            if (xmlBegin < 0)
            {
                xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            }
            continue;
        }
        // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
        fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
        if (w.IsEmpty())
        {
            if (Verbose)
            {
                log.Warn("Encountered an empty word. Shouldn't happen?");
            }
            continue;
        }
        // [cdm 2016:] surely this shouldn't happen!
        int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        pos = AdvancePos(sentChars, pos, w);
        if (pos - 1 >= sentChars.Count)
        {
            log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
            log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
        }
        else
        {
            fl = sentChars[pos - 1];
            int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            tokens.Add(MakeXmlToken(w, false, begin, end));
        }
    }
    // end for (go through everything again)
    if (xmlBuffer.Length > 0)
    {
        // Form the last XML token, if any
        string xmlTag = xmlBuffer.ToString();
        CoreLabel fl1 = sentChars[pos - 1];
        int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
    }
    if (Verbose)
    {
        foreach (CoreLabel token in tokens)
        {
            log.Info(token.ToShorterString());
        }
    }
}
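// The "neat trick" above, ((?<=X)|(?=X)), splits on zero-width matches so the delimiter
// survives as its own array element. In .NET, Regex.Split additionally returns the text of
// capturing groups, so this sketch uses a non-capturing group to get the same effect:
using System;
using System.Text.RegularExpressions;

public static class KeepDelimiterSplitDemo
{
    public static void Main()
    {
        string separator = "\n";
        string pattern = string.Format("(?:(?<={0})|(?={0}))", Regex.Escape(separator));
        string[] parts = Regex.Split("line1\nline2\nline3", pattern);
        // parts is ["line1", "\n", "line2", "\n", "line3"]: the newlines survive as tokens
        foreach (string p in parts)
        {
            Console.WriteLine(p == separator ? "<NL>" : p);
        }
    }
}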
/// <summary>
/// Evaluate performance on a list of sentences, predicted parses,
/// and gold parses.
/// </summary>
/// <returns>A map from metric name to metric value</returns>
public virtual IDictionary<string, double> Evaluate(IList<ICoreMap> sentences, IList<DependencyTree> trees, IList<DependencyTree> goldTrees)
{
    IDictionary<string, double> result = new Dictionary<string, double>();
    // We'll skip words which are punctuation. Retrieve tags indicating
    // punctuation in this treebank.
    ICollection<string> punctuationTags = GetPunctuationTags();
    if (trees.Count != goldTrees.Count)
    {
        log.Error("Incorrect number of trees.");
        return(null);
    }
    int correctArcs = 0;
    int correctArcsNoPunc = 0;
    int correctHeads = 0;
    int correctHeadsNoPunc = 0;
    int correctTrees = 0;
    int correctTreesNoPunc = 0;
    int correctRoot = 0;
    int sumArcs = 0;
    int sumArcsNoPunc = 0;
    for (int i = 0; i < trees.Count; ++i)
    {
        IList<CoreLabel> tokens = sentences[i].Get(typeof(CoreAnnotations.TokensAnnotation));
        if (trees[i].n != goldTrees[i].n)
        {
            log.Error("Tree " + (i + 1) + ": incorrect number of nodes.");
            return(null);
        }
        if (!trees[i].IsTree())
        {
            log.Error("Tree " + (i + 1) + ": illegal.");
            return(null);
        }
        int nCorrectHead = 0;
        int nCorrectHeadNoPunc = 0;
        int nNoPunc = 0;
        for (int j = 1; j <= trees[i].n; ++j)
        {
            if (trees[i].GetHead(j) == goldTrees[i].GetHead(j))
            {
                ++correctHeads;
                ++nCorrectHead;
                if (trees[i].GetLabel(j).Equals(goldTrees[i].GetLabel(j)))
                {
                    ++correctArcs;
                }
            }
            ++sumArcs;
            string tag = tokens[j - 1].Tag();
            if (!punctuationTags.Contains(tag))
            {
                ++sumArcsNoPunc;
                ++nNoPunc;
                if (trees[i].GetHead(j) == goldTrees[i].GetHead(j))
                {
                    ++correctHeadsNoPunc;
                    ++nCorrectHeadNoPunc;
                    if (trees[i].GetLabel(j).Equals(goldTrees[i].GetLabel(j)))
                    {
                        ++correctArcsNoPunc;
                    }
                }
            }
        }
        if (nCorrectHead == trees[i].n)
        {
            ++correctTrees;
        }
        if (nCorrectHeadNoPunc == nNoPunc)
        {
            ++correctTreesNoPunc;
        }
        if (trees[i].GetRoot() == goldTrees[i].GetRoot())
        {
            ++correctRoot;
        }
    }
    result["UAS"] = correctHeads * 100.0 / sumArcs;
    result["UASnoPunc"] = correctHeadsNoPunc * 100.0 / sumArcsNoPunc;
    result["LAS"] = correctArcs * 100.0 / sumArcs;
    result["LASnoPunc"] = correctArcsNoPunc * 100.0 / sumArcsNoPunc;
    result["UEM"] = correctTrees * 100.0 / trees.Count;
    result["UEMnoPunc"] = correctTreesNoPunc * 100.0 / trees.Count;
    result["ROOT"] = correctRoot * 100.0 / trees.Count;
    return(result);
}
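// A hand-worked UAS/LAS computation over one fabricated 4-token tree, matching the counting
// scheme above (tokens indexed from 1, head 0 = root; heads and labels are made up):
using System;

public static class UasLasDemo
{
    public static void Main()
    {
        int[] goldHead = { 2, 0, 2, 3 };          // gold head of token i+1
        int[] predHead = { 2, 0, 2, 2 };          // one head wrong
        string[] goldLabel = { "nsubj", "root", "obj", "nmod" };
        string[] predLabel = { "nsubj", "root", "iobj", "nmod" };
        int correctHeads = 0;
        int correctArcs = 0;
        for (int j = 0; j < goldHead.Length; j++)
        {
            if (predHead[j] == goldHead[j])
            {
                correctHeads++;                   // unlabeled attachment
                if (predLabel[j] == goldLabel[j])
                {
                    correctArcs++;                // labeled attachment
                }
            }
        }
        Console.WriteLine("UAS = " + (correctHeads * 100.0 / goldHead.Length));  // 75.0
        Console.WriteLine("LAS = " + (correctArcs * 100.0 / goldHead.Length));   // 50.0
    }
}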
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length < MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    bool Verbose = PropertiesUtils.GetBool(options, "v", false);
    File testTreebank = options.Contains("t") ? new File(options.GetProperty("t")) : null;
    int maxGoldSentLen = PropertiesUtils.GetInt(options, "l", int.MaxValue);
    bool SerInput = PropertiesUtils.GetBool(options, "o", false);
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    File trainTreebank = new File(parsedArgs[0]);
    DateTime startTime = new DateTime();
    log.Info("###################################");
    log.Info("### Joint Segmentation / Parser ###");
    log.Info("###################################");
    System.Console.Error.Printf("Start time: %s\n", startTime);
    JointParsingModel parsingModel = new JointParsingModel();
    parsingModel.SetVerbose(Verbose);
    parsingModel.SetMaxEvalSentLen(maxGoldSentLen);
    parsingModel.SetSerInput(SerInput);
    //WSGDEBUG -- Some stuff for eclipse debugging
    InputStream inputStream = null;
    try
    {
        if (Runtime.GetProperty("eclipse") == null)
        {
            inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(Runtime.@in)) : Runtime.@in;
        }
        else
        {
            FileInputStream fileStream = new FileInputStream(new File("debug.2.xml"));
            inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(fileStream)) : fileStream;
        }
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
        System.Environment.Exit(-1);
    }
    if (!trainTreebank.Exists())
    {
        log.Info("Training treebank does not exist!\n " + trainTreebank.GetPath());
    }
    else
    {
        if (testTreebank != null && !testTreebank.Exists())
        {
            log.Info("Test treebank does not exist!\n " + testTreebank.GetPath());
        }
        else
        {
            if (parsingModel.Run(trainTreebank, testTreebank, inputStream))
            {
                log.Info("Successful shutdown!");
            }
            else
            {
                log.Error("Parsing model failure.");
            }
        }
    }
    // Close the input stream only after the model has consumed it
    if (inputStream != null)
    {
        try
        {
            inputStream.Close();
        }
        catch (IOException)
        {
        }
    }
    DateTime stopTime = new DateTime();
    long elapsedTime = stopTime.GetTime() - startTime.GetTime();
    log.Info();
    log.Info();
    System.Console.Error.Printf("Completed processing at %s\n", stopTime);
    System.Console.Error.Printf("Elapsed time: %d seconds\n", (int)(elapsedTime / 1000F));
}
/// <summary>TODO Can add various signatures, setting the signature via Options.</summary>
/// <param name="word">The word to make a signature for</param>
/// <param name="loc">
/// Its position in the sentence (mainly so sentence-initial
/// capitalized words can be treated differently)
/// </param>
/// <returns>A String that is its signature (equivalence class)</returns>
public override string GetSignature(string word, int loc)
{
    string BaseLabel = "UNK";
    StringBuilder sb = new StringBuilder(BaseLabel);
    switch (unknownLevel)
    {
        case 1:
        {
            if (StringUtils.IsNumeric(word))
            {
                sb.Append('#');
                break;
            }
            else
            {
                if (StringUtils.IsPunct(word))
                {
                    sb.Append('!');
                    break;
                }
            }
            // Mutually exclusive patterns
            sb.Append(SpanishUnknownWordSignatures.ConditionalSuffix(word));
            sb.Append(SpanishUnknownWordSignatures.ImperfectSuffix(word));
            sb.Append(SpanishUnknownWordSignatures.InfinitiveSuffix(word));
            sb.Append(SpanishUnknownWordSignatures.AdverbSuffix(word));
            // Broad coverage patterns -- only apply if we haven't yet matched at all
            if (sb.ToString().Equals(BaseLabel))
            {
                if (SpanishUnknownWordSignatures.HasVerbFirstPersonPluralSuffix(word))
                {
                    sb.Append("-vb1p");
                }
                else
                {
                    if (SpanishUnknownWordSignatures.HasGerundSuffix(word))
                    {
                        sb.Append("-ger");
                    }
                    else
                    {
                        if (word.EndsWith("s"))
                        {
                            sb.Append("-s");
                        }
                    }
                }
            }
            // Backoff to suffix if we haven't matched anything else
            if (unknownSuffixSize > 0 && sb.ToString().Equals(BaseLabel))
            {
                int min = word.Length < unknownSuffixSize ? word.Length : unknownSuffixSize;
                sb.Append('-').Append(Sharpen.Runtime.Substring(word, word.Length - min));
            }
            char first = word[0];
            if ((char.IsUpperCase(first) || char.IsTitleCase(first)) && !IsUpperCase(word))
            {
                sb.Append("-C");
            }
            else
            {
                sb.Append("-c");
            }
            break;
        }
        default:
        {
            log.Error(string.Format("%s: Invalid unknown word signature! (%d)%n", this.GetType().FullName, unknownLevel));
            break;
        }
    }
    return(sb.ToString());
}
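// A stripped-down sketch of the signature scheme above for a few fabricated cases: numerals
// collapse to "UNK#", then morphological suffix cues and a capitalization flag are appended.
// The two suffix tests here are illustrative stand-ins for the SpanishUnknownWordSignatures
// checks, not the real patterns.
using System;
using System.Text;

public static class SignatureDemo
{
    public static string GetSignature(string word)
    {
        StringBuilder sb = new StringBuilder("UNK");
        if (word.Length > 0 && char.IsDigit(word[0]))
        {
            sb.Append('#');
            return sb.ToString();
        }
        if (word.EndsWith("ría")) { sb.Append("-cond"); }       // stand-in conditional suffix
        else if (word.EndsWith("ando")) { sb.Append("-ger"); }  // stand-in gerund suffix
        sb.Append(char.IsUpper(word[0]) ? "-C" : "-c");         // capitalization cue
        return sb.ToString();
    }

    public static void Main()
    {
        Console.WriteLine(GetSignature("hablando"));  // UNK-ger-c
        Console.WriteLine(GetSignature("Comería"));   // UNK-cond-C
    }
}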
public override void Display(bool verbose, PrintWriter pw)
{
    if (precisions.KeySet().Count != recalls.KeySet().Count)
    {
        log.Error("Different counts for precisions and recalls!");
        return;
    }
    ICollection<ILabel> cats = GetEvalLabelSet(precisions.KeySet());
    Random rand = new Random();
    IDictionary<double, ILabel> f1Map = new SortedDictionary<double, ILabel>();
    foreach (ILabel cat in cats)
    {
        double pnum2 = pnums2.GetCount(cat);
        double rnum2 = rnums2.GetCount(cat);
        double prec = precisions2.GetCount(cat) / pnum2;
        double rec = recalls2.GetCount(cat) / rnum2;
        double f1 = 2.0 / (1.0 / prec + 1.0 / rec);
        if (f1.Equals(double.NaN))
        {
            f1 = -1.0;
        }
        if (f1Map.Contains(f1))
        {
            f1Map[f1 + (rand.NextDouble() / 1000.0)] = cat;
        }
        else
        {
            f1Map[f1] = cat;
        }
    }
    pw.Println("============================================================");
    pw.Println("Labeled Bracketed Evaluation by Category -- final statistics");
    pw.Println("============================================================");
    // Per category
    double catPrecisions = 0.0;
    double catPrecisionNums = 0.0;
    double catRecalls = 0.0;
    double catRecallNums = 0.0;
    foreach (ILabel cat_1 in f1Map.Values)
    {
        double pnum2 = pnums2.GetCount(cat_1);
        double rnum2 = rnums2.GetCount(cat_1);
        double prec = precisions2.GetCount(cat_1) / pnum2;
        prec *= 100.0;
        double rec = recalls2.GetCount(cat_1) / rnum2;
        rec *= 100.0;
        double f1 = 2.0 / (1.0 / prec + 1.0 / rec);
        catPrecisions += precisions2.GetCount(cat_1);
        catPrecisionNums += pnum2;
        catRecalls += recalls2.GetCount(cat_1);
        catRecallNums += rnum2;
        string Lp = pnum2 == 0.0 ? "N/A" : string.Format("%.2f", prec);
        string Lr = rnum2 == 0.0 ? "N/A" : string.Format("%.2f", rec);
        string F1 = (pnum2 == 0.0 || rnum2 == 0.0) ? "N/A" : string.Format("%.2f", f1);
        pw.Printf("%s\tLP: %s\tguessed: %d\tLR: %s\tgold: %d\t F1: %s%n", cat_1.Value(), Lp, (int)pnum2, Lr, (int)rnum2, F1);
    }
    pw.Println("============================================================");
    // Totals
    double prec_1 = catPrecisions / catPrecisionNums;
    double rec_1 = catRecalls / catRecallNums;
    double f1_1 = (2 * prec_1 * rec_1) / (prec_1 + rec_1);
    pw.Printf("Total\tLP: %.2f\tguessed: %d\tLR: %.2f\tgold: %d\t F1: %.2f%n", prec_1 * 100.0, (int)catPrecisionNums, rec_1 * 100.0, (int)catRecallNums, f1_1 * 100.0);
    pw.Println("============================================================");
}
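// F1 above is the harmonic mean of precision and recall written as 2/(1/P + 1/R); when a
// category has zero guessed or gold counts, the 0/0 divisions produce NaN, which the code
// maps to -1 so the category still sorts. A minimal sketch of the same guard:
using System;

public static class F1Demo
{
    public static double F1(double precision, double recall)
    {
        double f1 = 2.0 / (1.0 / precision + 1.0 / recall);
        return double.IsNaN(f1) ? -1.0 : f1;
    }

    public static void Main()
    {
        Console.WriteLine(F1(0.90, 0.80));       // about 0.847
        Console.WriteLine(F1(0.0 / 0.0, 0.80));  // NaN precision (0/0 counts) maps to -1
    }
}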