/// <summary>
/// Reads each input path, applies the escaper and lexical mapper to tokens
/// containing Arabic script, and writes the processed sentences to
/// <c>outFileName</c> in UTF-8.
/// </summary>
public virtual void Build()
{
    LineNumberReader infile = null;
    PrintWriter outfile = null;
    string currentInfile = string.Empty;
    try
    {
        outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
        foreach (File path in pathsToData)
        {
            infile = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8")));
            currentInfile = path.GetPath();
            while (infile.Ready())
            {
                List<Word> sent = SentenceUtils.ToUntaggedList(infile.ReadLine().Split("\\s+"));
                foreach (Word token in sent)
                {
                    // Only escape/map tokens that actually contain Arabic characters.
                    Matcher hasArabic = utf8ArabicChart.Matcher(token.Word());
                    if (hasArabic.Find())
                    {
                        token.SetWord(escaper.Apply(token.Word()));
                        token.SetWord(lexMapper.Map(null, token.Word()));
                    }
                }
                outfile.Println(SentenceUtils.ListToString(sent));
            }
            // NOTE(review): Java-style "%d"/"%s" placeholders passed to string.Format,
            // which expects "{0}"-style — presumably harmless log text; confirm intent.
            toStringBuffer.Append(string.Format(" Read %d input lines from %s", infile.GetLineNumber(), path.GetPath()));
            // Fix: close each input reader as soon as its file is consumed. The
            // original closed only the last reader (after the loop), leaking one
            // open reader per additional input path.
            infile.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        System.Console.Error.Printf("%s: Filesystem does not support UTF-8 output\n", this.GetType().FullName);
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException)
    {
        System.Console.Error.Printf("%s: Could not open %s for writing\n", this.GetType().FullName, outFileName);
    }
    catch (IOException)
    {
        System.Console.Error.Printf("%s: Error reading from %s (line %d)\n", this.GetType().FullName, currentInfile, infile.GetLineNumber());
    }
    catch (Exception e)
    {
        System.Console.Error.Printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", this.GetType().FullName, currentInfile, infile.GetLineNumber());
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        if (outfile != null)
        {
            outfile.Close();
        }
    }
}
/// <summary>Tags the sentence s by running maxent model.</summary>
/// <remarks>
/// Tags the sentence s by running maxent model. Returns a sentence (List) of
/// TaggedWord objects.
/// </remarks>
/// <param name="s">Input sentence (List). This isn't changed.</param>
/// <param name="reuseTags">If true, tags already present on the input tokens are collected and used to seed inference.</param>
/// <returns>Tagged sentence</returns>
public virtual List<TaggedWord> TagSentence<_T0>(IList<_T0> s, bool reuseTags)
    where _T0 : IHasWord
{
    // Keep the caller's original tokens; their words are restored on the result below.
    this.origWords = new List<IHasWord>(s);
    int sz = s.Count;
    this.sent = new List<string>(sz + 1);
    foreach (IHasWord value1 in s)
    {
        if (maxentTagger.wordFunction != null)
        {
            // Normalize each word through the tagger's configured word function.
            sent.Add(maxentTagger.wordFunction.Apply(value1.Word()));
        }
        else
        {
            sent.Add(value1.Word());
        }
    }
    // Append the end-of-sentence sentinel word expected by the model.
    sent.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
    if (reuseTags)
    {
        // Record any pre-existing tags (null for untagged tokens) for reuse.
        this.originalTags = new List<string>(sz + 1);
        foreach (IHasWord value in s)
        {
            if (value is IHasTag)
            {
                originalTags.Add(((IHasTag)value).Tag());
            }
            else
            {
                originalTags.Add(null);
            }
        }
        originalTags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
    }
    // size includes the EOS sentinel.
    size = sz + 1;
    if (Verbose)
    {
        log.Info("Sentence is " + SentenceUtils.ListToString(sent, false, tagSeparator));
    }
    // Init() must run after sent/originalTags/size are populated.
    Init();
    List<TaggedWord> result = TestTagInference();
    if (maxentTagger.wordFunction != null)
    {
        // Restore the caller's original (un-normalized) words on the output.
        for (int j = 0; j < sz; ++j)
        {
            result[j].SetWord(s[j].Word());
        }
    }
    return (result);
}
/// <summary>Reads an annotation from the given filename using the requested input.</summary>
public static IList<Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
{
    if (inputFormat == SentimentPipeline.Input.Text)
    {
        // Plain text: run the tokenizer pipeline over the whole file, then wrap
        // every resulting sentence in its own single-sentence Annotation.
        string fileText = IOUtils.SlurpFileNoExceptions(filename);
        Annotation wholeDoc = new Annotation(fileText);
        tokenizer.Annotate(wholeDoc);
        IList<Annotation> result = Generics.NewArrayList();
        foreach (ICoreMap sent in wholeDoc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            Annotation single = new Annotation(sent.Get(typeof(CoreAnnotations.TextAnnotation)));
            single.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sent));
            result.Add(single);
        }
        return result;
    }
    if (inputFormat == SentimentPipeline.Input.Trees)
    {
        // Pre-parsed trees: read them (optionally keeping only gold-labeled,
        // known roots) and attach each tree to a one-sentence Annotation.
        IList<Tree> parsed;
        if (filterUnknown)
        {
            parsed = SentimentUtils.ReadTreesWithGoldLabels(filename);
            parsed = SentimentUtils.FilterUnknownRoots(parsed);
        }
        else
        {
            MemoryTreebank bank = new MemoryTreebank("utf-8");
            bank.LoadPath(filename, null);
            parsed = new List<Tree>(bank);
        }
        IList<Annotation> result = Generics.NewArrayList();
        foreach (Tree parse in parsed)
        {
            ICoreMap sent = new Annotation(SentenceUtils.ListToString(parse.Yield()));
            sent.Set(typeof(TreeCoreAnnotations.TreeAnnotation), parse);
            IList<ICoreMap> sentList = Java.Util.Collections.SingletonList(sent);
            Annotation single = new Annotation(string.Empty);
            single.Set(typeof(CoreAnnotations.SentencesAnnotation), sentList);
            result.Add(single);
        }
        return result;
    }
    throw new ArgumentException("Unknown format " + inputFormat);
}
/// <summary>Converts a parse tree into a string of tokens.</summary>
/// <remarks>
/// Converts a parse tree into a string of tokens. Each token is a word and
/// its POS tag separated by the delimiter specified by <code>separator</code>
/// </remarks>
/// <param name="t">- A parse tree</param>
/// <param name="removeEscaping">- If true, remove LDC escape characters. Otherwise, leave them.</param>
/// <param name="separator">Word/tag separator</param>
/// <returns>A string of tagged words</returns>
public static string TaggedStringFromTree(Tree t, bool removeEscaping, string separator)
{
    // Drop empty nodes first, then rewrite each label's surface form in place.
    Tree pruned = t.Prune(emptyFilter, tf);
    IList<CoreLabel> labels = pruned.TaggedLabeledYield();
    foreach (CoreLabel label in labels)
    {
        string surface = removeEscaping ? UnEscape(label.Word()) : label.Word();
        label.SetWord(surface);
        label.SetValue(surface);
    }
    return SentenceUtils.ListToString(labels, false, separator);
}
// Splits the XML input on the given element delimiter and checks the
// extracted sentences against the expected token strings, in order.
private static void CompareXMLResults(string input, string element, params string[] expectedResults)
{
    DocumentPreprocessor doc = new DocumentPreprocessor(new BufferedReader(new StringReader(input)), DocumentPreprocessor.DocType.Xml);
    doc.SetElementDelimiter(element);
    List<string> actual = new List<string>();
    foreach (IList<IHasWord> words in doc)
    {
        actual.Add(SentenceUtils.ListToString(words));
    }
    NUnit.Framework.Assert.AreEqual(expectedResults.Length, actual.Count);
    for (int idx = 0; idx < actual.Count; ++idx)
    {
        NUnit.Framework.Assert.AreEqual(expectedResults[idx], actual[idx]);
    }
}
/// <summary>Demo entry point: tags every sentence of a text file with a maxent model.</summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        log.Info("usage: java TaggerDemo modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // Tokenize the input file into sentences, then tag and print each one.
    IList<IList<IHasWord>> sentences = MaxentTagger.TokenizeText(new BufferedReader(new FileReader(args[1])));
    foreach (IList<IHasWord> words in sentences)
    {
        IList<TaggedWord> tagged = tagger.TagSentence(words);
        System.Console.Out.WriteLine(SentenceUtils.ListToString(tagged, false));
    }
}
// Post-processes a multi-word-expression subtree: if its whitespace-stripped
// yield consists only of digits and punctuation it is collapsed to a single
// leaf; otherwise the node label gets the MWE phrasal prefix.
private Tree PostProcessMWE(Tree t)
{
    string squashed = SentenceUtils.ListToString(t.Yield()).ReplaceAll("\\s+", string.Empty);
    if (!squashed.Matches("[\\d\\p{Punct}]*"))
    {
        t.SetValue(MwePhrasal + t.Value());
        return (t);
    }
    IList<Tree> children = new List<Tree>();
    children.Add(treeFactory.NewLeaf(squashed));
    return (treeFactory.NewTreeNode(t.Value(), children));
}
// static methods
/// <summary>
/// Recursively sets internal-node labels from <paramref name="labelMap"/>,
/// keyed by each subtree's yield text. Yields absent from the map are handled
/// according to <paramref name="missing"/>.
/// </summary>
/// <param name="tree">Tree whose labels are rewritten in place</param>
/// <param name="labelMap">Map from yield text to the label to assign</param>
/// <param name="missing">Policy for yields not found in the map</param>
/// <param name="defaultLabel">Label used when the policy is Default</param>
/// <param name="unknowns">Collects yields that fell back to the default label</param>
public static void SetLabels(Tree tree, IDictionary<string, string> labelMap, ParseAndSetLabels.MissingLabels missing, string defaultLabel, ICollection<string> unknowns)
{
    if (tree.IsLeaf())
    {
        return;
    }
    string text = SentenceUtils.ListToString(tree.Yield());
    // Fix: the original indexed labelMap[text] directly, which on a standard
    // .NET IDictionary throws KeyNotFoundException for unmapped yields
    // (the Java original's map.get returned null). TryGetValue restores the
    // intended "missing key -> fall through to the missing-label policy" flow.
    string label;
    if (labelMap.TryGetValue(text, out label) && label != null)
    {
        tree.Label().SetValue(label);
    }
    else
    {
        switch (missing)
        {
            case ParseAndSetLabels.MissingLabels.Fail:
            {
                throw new Exception("No label for '" + text + "'");
            }
            case ParseAndSetLabels.MissingLabels.Default:
            {
                tree.Label().SetValue(defaultLabel);
                unknowns.Add(text);
                break;
            }
            case ParseAndSetLabels.MissingLabels.KeepOriginal:
            {
                // do nothing
                break;
            }
            default:
            {
                throw new ArgumentException("Unknown MissingLabels mode " + missing);
            }
        }
    }
    foreach (Tree child in tree.Children())
    {
        SetLabels(child, labelMap, missing, defaultLabel, unknowns);
    }
}
/// <summary>
/// Evaluates a guess tree against a gold tree, logging a warning when their
/// yields differ in length, then delegates to the base implementation.
/// </summary>
public override void Evaluate(Tree guess, Tree gold, PrintWriter pw)
{
    if (guess == null || gold == null)
    {
        System.Console.Error.Printf("%s: Cannot compare against a null gold or guess tree!\n", this.GetType().FullName);
        return;
    }
    if (guess.Yield().Count != gold.Yield().Count)
    {
        log.Info("Warning: yield differs:");
        log.Info("Guess: " + SentenceUtils.ListToString(guess.Yield()));
        log.Info("Gold: " + SentenceUtils.ListToString(gold.Yield()));
    }
    base.Evaluate(guess, gold, pw);
}
// Checks that the Spanish dateline boundary regex splits each test text into
// exactly two sentences, the first carrying the expected dateline tokens.
public virtual void TestSpanishDatelineSeparation()
{
    Properties props = PropertiesUtils.AsProperties(
        "annotators", "tokenize, cleanxml, ssplit",
        "tokenize.language", "es",
        "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
        "ssplit.newlineIsSentenceBreak", "two",
        "ssplit.boundaryMultiTokenRegex",
        "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/ /[-\\p{L}]+/+ )? " + "( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    NUnit.Framework.Assert.AreEqual(dateLineSpanishTexts.Length, dateLineSpanishTokens.Length, "Bad test data");
    for (int i = 0; i < dateLineSpanishTexts.Length; i++)
    {
        Annotation doc = new Annotation(dateLineSpanishTexts[i]);
        pipeline.Annotate(doc);
        IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
        NUnit.Framework.Assert.AreEqual(2, split.Count, "For " + dateLineSpanishTexts[i] + " annotation is " + doc);
        IList<CoreLabel> firstTokens = split[0].Get(typeof(CoreAnnotations.TokensAnnotation));
        string firstSentence = SentenceUtils.ListToString(firstTokens);
        NUnit.Framework.Assert.AreEqual(dateLineSpanishTokens[i], firstSentence, "Bad tokens in dateline");
    }
}
// Tokenizes each untokenized Arabic input with the ATB tokenizer (pro/seg/
// morph markers removed) and compares against the reference tokenization.
public virtual void TestArabicTokenizer()
{
    System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
    ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.AtbFactory();
    factory.SetOptions("removeProMarker");
    factory.SetOptions("removeSegMarker");
    factory.SetOptions("removeMorphMarker");
    for (int idx = 0; idx < untokInputs.Length; ++idx)
    {
        string raw = untokInputs[idx];
        IList<CoreLabel> words = factory.GetTokenizer(new StringReader(raw)).Tokenize();
        string actual = SentenceUtils.ListToString(words);
        NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", tokReferences[idx], actual);
    }
}
// Reads tagged sentences from the configured input and prints a uniformly
// random non-empty prefix of each; with -fullSentence the complete sentence
// is printed as well.
public static void Main(string[] args)
{
    Properties config = StringUtils.ArgsToProperties(args);
    log.Info(config);
    bool fullSentence = PropertiesUtils.GetBool(config, "fullSentence", false);
    Random random = new Random();
    string tagSeparator = config.GetProperty("tagSeparator", TaggerConfig.TagSeparator);
    TaggedFileRecord record = TaggedFileRecord.CreateRecord(config, config.GetProperty("input"));
    foreach (IList<TaggedWord> sentence in record.Reader())
    {
        // Prefix length in [1, sentence.Count].
        int prefixLen = random.NextInt(sentence.Count) + 1;
        System.Console.Out.WriteLine(SentenceUtils.ListToString(sentence.SubList(0, prefixLen), false, tagSeparator));
        if (fullSentence)
        {
            System.Console.Out.WriteLine(SentenceUtils.ListToString(sentence, false, tagSeparator));
        }
    }
}
// With newlineIsSentenceBreak=two and tokenizeNLs, only a blank line splits
// sentences; single newlines do not (NL tokens still appear in the count).
public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
{
    string text = "This is \none sentence\n\nThis is not another.";
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.language", "en", "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(2, split.Count);
    // make sure that there are the correct # of tokens (does contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
    IList<CoreLabel> secondTokens = split[1].Get(typeof(CoreAnnotations.TokensAnnotation));
    string second = SentenceUtils.ListToString(secondTokens);
    NUnit.Framework.Assert.AreEqual("This is not another .", second, "Bad tokens in sentence");
}
// End-to-end check of the Spanish KBP tokenize/cleanxml/ssplit configuration:
// the document must split into the expected sentences, each with the expected
// tokenization, and the sentence count must match exactly.
public virtual void TestKbpSpanishWorks()
{
    Properties props = PropertiesUtils.AsProperties(
        "annotators", "tokenize, cleanxml, ssplit",
        "tokenize.language", "es",
        "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
        "ssplit.newlineIsSentenceBreak", "two",
        // Discard literal newlines and *NL* pseudo-tokens from sentences.
        "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
        // Multi-token boundary pattern (dateline-style prefix at line start).
        "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
        // cleanxml configuration for the discussion-forum markup.
        "clean.xmltags", "headline|text|post",
        "clean.singlesentencetags", "HEADLINE|AUTHOR",
        "clean.sentenceendingtags", "TEXT|POST|QUOTE",
        "clean.turntags", "POST|QUOTE",
        "clean.speakertags", "AUTHOR",
        "clean.datetags", "DATE_TIME",
        "clean.doctypetags", "DOC",
        "clean.docAnnotations", "docID=doc[id]",
        "clean.sectiontags", "HEADLINE|POST",
        "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
        "clean.quotetags", "quote",
        "clean.quoteauthorattributes", "orig_author",
        "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document1 = new Annotation(kbpSpanishDocument);
    pipeline.Annotate(document1);
    IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
    // Compare as many sentences as both sides have, then assert the count.
    for (int i = 0; i < Math.Min(kbpSpanishSentences.Length, sentences.Count); i++)
    {
        ICoreMap sentence = sentences[i];
        string sentenceText = SentenceUtils.ListToString(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[i], sentenceText, "Bad sentence #" + i);
    }
    NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, sentences.Count, "Bad total number of sentences");
}
/// <summary>
/// Renders, for each class, the stored n-grams grouped by length. Within a
/// length group the sorted list is walked from the back (so the last element
/// of the comparator's order prints first — assuming ascending sort).
/// </summary>
public override string ToString()
{
    StringBuilder sb = new StringBuilder();
    for (int cls = 0; cls < numClasses; ++cls)
    {
        sb.Append("Best scores for class " + cls + "\n");
        IDictionary<int, PriorityQueue<Tree>> byLength = classToNGrams[cls];
        foreach (KeyValuePair<int, PriorityQueue<Tree>> lengthEntry in byLength)
        {
            IList<Tree> ordered = Generics.NewArrayList(lengthEntry.Value);
            ordered.Sort(ScoreComparator(cls));
            sb.Append("  Len " + lengthEntry.Key + "\n");
            for (int pos = ordered.Count - 1; pos >= 0; pos--)
            {
                Tree ngram = ordered[pos];
                sb.Append("    " + SentenceUtils.ListToString(ngram.Yield()) + " [" + RNNCoreAnnotations.GetPredictions(ngram).Get(cls) + "]\n");
            }
        }
    }
    return (sb.ToString());
}
// Checks that the English dateline boundary regex splits each test text into
// exactly two sentences, the first carrying the expected dateline tokens.
public virtual void TestDatelineSeparation()
{
    Properties props = PropertiesUtils.AsProperties(
        "annotators", "tokenize, cleanxml, ssplit",
        "tokenize.language", "en",
        "ssplit.newlineIsSentenceBreak", "two",
        // Boundary pattern with two alternatives (note the '|' between them).
        "ssplit.boundaryMultiTokenRegex", "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | " + "/\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    NUnit.Framework.Assert.AreEqual(dateLineTexts.Length, dateLineTokens.Length, "Bad test data");
    for (int i = 0; i < dateLineTexts.Length; i++)
    {
        Annotation document1 = new Annotation(dateLineTexts[i]);
        pipeline.Annotate(document1);
        IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
        // for (CoreMap sentence : sentences) {
        //   String sentenceText = SentenceUtils.listToString(sentence.get(CoreAnnotations.TokensAnnotation.class));
        //   System.err.println(sentenceText);
        // }
        NUnit.Framework.Assert.AreEqual(2, sentences.Count, "For " + dateLineTexts[i] + " annotation is " + document1);
        IList<CoreLabel> sentenceOneTokens = sentences[0].Get(typeof(CoreAnnotations.TokensAnnotation));
        string sentenceOne = SentenceUtils.ListToString(sentenceOneTokens);
        NUnit.Framework.Assert.AreEqual(dateLineTokens[i], sentenceOne, "Bad tokens in dateline");
    }
}
/// <summary>
/// Segments the raw string via max-match over the segmentation lattice, then
/// runs sentence post-processing and the CTP (Sighan2005) post-processor, and
/// returns the final whitespace-split words.
/// </summary>
public virtual IList<IHasWord> Segment(string s)
{
    BuildSegmentationLattice(s);
    List<Word> rawSegments = MaxMatchSegmentation();
    PrintlnErr("raw output: " + SentenceUtils.ListToString(rawSegments));
    List<Word> processed = PostProcessSentence(rawSegments);
    PrintlnErr("processed output: " + SentenceUtils.ListToString(processed));
    ChineseStringUtils.CTPPostProcessor ctp = new ChineseStringUtils.CTPPostProcessor();
    string sighan = ctp.PostProcessingAnswer(processed.ToString(), false);
    PrintlnErr("Sighan2005 output: " + sighan);
    List<Word> words = new List<Word>();
    foreach (string piece in sighan.Split("\\s+"))
    {
        words.Add(new Word(piece));
    }
    return (new List<IHasWord>(words));
}
/// <summary>
/// Updates the unigram tagger from the tree, then accumulates co-occurrence
/// counts between each matched MWE node's label, its preterminal yield, and
/// its terminal yield (in both directions).
/// </summary>
public static void CountMWEStatistics(Tree t, TwoDimensionalCounter<string, string> unigramTagger, TwoDimensionalCounter<string, string> labelPreterm, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string
    > labelTerm, TwoDimensionalCounter<string, string> termLabel)
{
    UpdateTagger(unigramTagger, t);
    //Count MWE statistics
    TregexMatcher matcher = pMWE.Matcher(t);
    while (matcher.FindNextMatchingNode())
    {
        Tree node = matcher.GetMatch();
        string nodeLabel = node.Value();
        // Skip placeholder labels when dummy-tag resolution is enabled.
        if (ResolveDummyTags && nodeLabel.Equals(FrenchXMLTreeReader.MissingPhrasal))
        {
            continue;
        }
        string preterms = SentenceUtils.ListToString(node.PreTerminalYield());
        string terms = SentenceUtils.ListToString(node.Yield());
        labelPreterm.IncrementCount(nodeLabel, preterms);
        pretermLabel.IncrementCount(preterms, nodeLabel);
        labelTerm.IncrementCount(nodeLabel, terms);
        termLabel.IncrementCount(terms, nodeLabel);
    }
}
// Runs DocumentPreprocessor over the input — optionally with custom
// sentence-final punctuation and/or whitespace tokenization — and checks
// the resulting sentence strings against the expected values.
private static void RunTest(string input, string[] expected, string[] sentenceFinalPuncWords, bool whitespaceTokenize)
{
    DocumentPreprocessor doc = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
    if (sentenceFinalPuncWords != null)
    {
        doc.SetSentenceFinalPuncWords(sentenceFinalPuncWords);
    }
    if (whitespaceTokenize)
    {
        // A null tokenizer factory means whitespace tokenization; newlines
        // then delimit sentences.
        doc.SetTokenizerFactory(null);
        doc.SetSentenceDelimiter("\n");
    }
    IList<string> results = new List<string>();
    foreach (IList<IHasWord> words in doc)
    {
        results.Add(SentenceUtils.ListToString(words));
    }
    NUnit.Framework.Assert.AreEqual("Should be " + expected.Length + " sentences but got " + results.Count + ": " + results, expected.Length, results.Count);
    for (int i = 0; i < results.Count; ++i)
    {
        NUnit.Framework.Assert.AreEqual("Failed on sentence " + i, expected[i], results[i]);
    }
}
// With newlineIsSentenceBreak=always, every newline ends a sentence, so the
// text yields three sentences (the token count still includes NL tokens).
public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
{
    string text = "This is \none sentence\n\nThis is not another.";
    string[] sents = new string[] { "This is", "one sentence", "This is not another ." };
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "always");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.Annotate(doc);
    IList<ICoreMap> split = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(3, split.Count);
    // make sure that there are the correct # of tokens (count does contain NL tokens)
    IList<CoreLabel> allTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, allTokens.Count);
    for (int i = 0; i < Math.Min(sents.Length, split.Count); i++)
    {
        string sentenceText = SentenceUtils.ListToString(split[i].Get(typeof(CoreAnnotations.TokensAnnotation)));
        NUnit.Framework.Assert.AreEqual(sents[i], sentenceText, "Bad sentence #" + i);
    }
}
/// <summary>
/// Parses each sentence of an input file with a reranking DV parser and, per
/// sentence, writes the parse tree plus the word and tree-node vectors to a
/// numbered file in the output directory.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputPath = null;
    string inputPath = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = Generics.NewArrayList();
    // Argument parsing: recognized flags consume their value(s); anything
    // else is collected and passed through to the parser loader.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
        {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
            {
                outputPath = args[argIndex + 1];
                argIndex += 2;
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                {
                    inputPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        // -testTreebank may take multiple sub-arguments (path + filter).
                        Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex]);
                        argIndex++;
                    }
                }
            }
        }
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
    DVModel model = DVParser.GetModelFromLexicalizedParser(parser);
    // The output directory must not already exist; it is created fresh.
    File outputFile = new File(outputPath);
    FileSystem.CheckNotExistsOrFail(outputFile);
    FileSystem.MkdirOrFail(outputFile);
    int count = 0;
    if (inputPath != null)
    {
        Reader input = new BufferedReader(new FileReader(inputPath));
        DocumentPreprocessor processor = new DocumentPreprocessor(input);
        foreach (IList<IHasWord> sentence in processor)
        {
            count++;
            // index from 1
            IParserQuery pq = parser.ParserQuery();
            if (!(pq is RerankingParserQuery))
            {
                throw new ArgumentException("Expected a RerankingParserQuery");
            }
            RerankingParserQuery rpq = (RerankingParserQuery)pq;
            if (!rpq.Parse(sentence))
            {
                throw new Exception("Unparsable sentence: " + sentence);
            }
            IRerankerQuery reranker = rpq.RerankerQuery();
            if (!(reranker is DVModelReranker.Query))
            {
                throw new ArgumentException("Expected a DVModelReranker");
            }
            // Take the first deep tree from the reranker and log its vectors.
            DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
            IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.GetVectors();
            foreach (KeyValuePair<Tree, SimpleMatrix> entry in vectors)
            {
                log.Info(entry.Key + " " + entry.Value);
            }
            // Write sentence text, tree, word vectors, and node matrices.
            FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
            BufferedWriter bout = new BufferedWriter(fout);
            bout.Write(SentenceUtils.ListToString(sentence));
            bout.NewLine();
            bout.Write(deepTree.GetTree().ToString());
            bout.NewLine();
            foreach (IHasWord word in sentence)
            {
                OutputMatrix(bout, model.GetWordVector(word.Word()));
            }
            Tree rootTree = FindRootTree(vectors);
            OutputTreeMatrices(bout, rootTree, vectors);
            bout.Flush();
            fout.Close();
        }
    }
}
/// <summary>Test the parser on a treebank.</summary>
/// <remarks>
/// Test the parser on a treebank. Parses will be written to stdout, and
/// various other information will be written to stderr and stdout,
/// particularly if <code>op.testOptions.verbose</code> is true.
/// </remarks>
/// <param name="testTreebank">The treebank to parse</param>
/// <returns>
/// The labeled precision/recall F<sub>1</sub> (EVALB measure)
/// of the parser on the treebank.
/// </returns>
public virtual double TestOnTreebank(Treebank testTreebank)
{
    log.Info("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.TreePrint(op.tlpParams);
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = op.Langpack();
    PrintWriter pwOut;
    PrintWriter pwErr;
    // In quiet mode both streams are swallowed while parsing; pwErr is
    // re-pointed at stderr again before the evaluation summary below.
    if (op.testOptions.quietEvaluation)
    {
        NullOutputStream quiet = new NullOutputStream();
        pwOut = tlpParams.Pw(quiet);
        pwErr = tlpParams.Pw(quiet);
    }
    else
    {
        pwOut = tlpParams.Pw();
        pwErr = tlpParams.Pw(System.Console.Error);
    }
    if (op.testOptions.verbose)
    {
        pwErr.Print("Testing ");
        pwErr.Println(testTreebank.TextualSummary(tlp));
    }
    if (op.testOptions.evalb)
    {
        EvalbFormatWriter.InitEVALBfiles(tlpParams);
    }
    // Optional per-parse output file; failures to open are logged, not fatal.
    PrintWriter pwFileOut = null;
    if (op.testOptions.writeOutputFiles)
    {
        string fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
        try
        {
            pwFileOut = op.tlpParams.Pw(new FileOutputStream(fname));
        }
        catch (IOException ioe)
        {
            Sharpen.Runtime.PrintStackTrace(ioe);
        }
    }
    // Optional k-best equivocation statistics file.
    PrintWriter pwStats = null;
    if (op.testOptions.outputkBestEquivocation != null)
    {
        try
        {
            pwStats = op.tlpParams.Pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
        }
        catch (IOException ioe)
        {
            Sharpen.Runtime.PrintStackTrace(ioe);
        }
    }
    if (op.testOptions.testingThreads != 1)
    {
        // Multithreaded parsing: feed sentences to the wrapper and drain
        // completed queries in submission order, pairing each with its gold tree.
        MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
        LinkedList<Tree> goldTrees = new LinkedList<Tree>();
        foreach (Tree goldTree in testTreebank)
        {
            IList<IHasWord> sentence = GetInputSentence(goldTree);
            goldTrees.Add(goldTree);
            pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
            wrapper.Put(sentence);
            while (wrapper.Peek())
            {
                IParserQuery pq = wrapper.Poll();
                goldTree = goldTrees.Poll();
                ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
            }
        }
        // for tree iterator
        wrapper.Join();
        // Drain whatever is still in flight after the last submission.
        while (wrapper.Peek())
        {
            IParserQuery pq = wrapper.Poll();
            Tree goldTree_1 = goldTrees.Poll();
            ProcessResults(pq, goldTree_1, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
    }
    else
    {
        // Single-threaded parsing: one query object reused for every sentence.
        IParserQuery pq = pqFactory.ParserQuery();
        foreach (Tree goldTree in testTreebank)
        {
            IList<CoreLabel> sentence = GetInputSentence(goldTree);
            pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
            pq.ParseAndReport(sentence, pwErr);
            ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
    }
    // for tree iterator
    //Done parsing...print the results of the evaluations
    treebankTotalTimer.Done("Testing on treebank");
    if (op.testOptions.quietEvaluation)
    {
        pwErr = tlpParams.Pw(System.Console.Error);
    }
    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }
    if (op.testOptions.evalb)
    {
        EvalbFormatWriter.CloseEVALBfiles();
    }
    if (numSkippedEvals != 0)
    {
        pwErr.Printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
    }
    // only created here so we know what parser types are supported...
    IParserQuery pq_1 = pqFactory.ParserQuery();
    if (summary)
    {
        // Display every configured evaluation metric that applies.
        if (pcfgLB != null) { pcfgLB.Display(false, pwErr); }
        if (pcfgChildSpecific != null) { pcfgChildSpecific.Display(false, pwErr); }
        if (pcfgLA != null) { pcfgLA.Display(false, pwErr); }
        if (pcfgCB != null) { pcfgCB.Display(false, pwErr); }
        if (pcfgDA != null) { pcfgDA.Display(false, pwErr); }
        if (pcfgTA != null) { pcfgTA.Display(false, pwErr); }
        if (pcfgLL != null && pq_1.GetPCFGParser() != null) { pcfgLL.Display(false, pwErr); }
        if (depDA != null) { depDA.Display(false, pwErr); }
        if (depTA != null) { depTA.Display(false, pwErr); }
        if (depLL != null && pq_1.GetDependencyParser() != null) { depLL.Display(false, pwErr); }
        if (factLB != null) { factLB.Display(false, pwErr); }
        if (factChildSpecific != null) { factChildSpecific.Display(false, pwErr); }
        if (factLA != null) { factLA.Display(false, pwErr); }
        if (factCB != null) { factCB.Display(false, pwErr); }
        if (factDA != null) { factDA.Display(false, pwErr); }
        if (factTA != null) { factTA.Display(false, pwErr); }
        if (factLL != null && pq_1.GetFactoredParser() != null) { factLL.Display(false, pwErr); }
        if (pcfgCatE != null) { pcfgCatE.Display(false, pwErr); }
        foreach (IEval eval in evals) { eval.Display(false, pwErr); }
        foreach (BestOfTopKEval eval_1 in topKEvals) { eval_1.Display(false, pwErr); }
    }
    // these ones only have a display mode, so display if turned on!!
    if (pcfgRUO != null) { pcfgRUO.Display(true, pwErr); }
    if (pcfgCUO != null) { pcfgCUO.Display(true, pwErr); }
    if (tsv)
    {
        // Tab-separated one-line summary of the headline numbers.
        NumberFormat nf = new DecimalFormat("0.00");
        pwErr.Println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
        if (factLB != null) { pwErr.Print(nf.Format(factLB.GetEvalbF1Percent())); }
        pwErr.Print("\t");
        if (pq_1.GetDependencyParser() != null && factDA != null) { pwErr.Print(nf.Format(factDA.GetEvalbF1Percent())); }
        pwErr.Print("\t");
        if (factLB != null) { pwErr.Print(nf.Format(factLB.GetExactPercent())); }
        pwErr.Print("\t");
        if (pcfgLB != null) { pwErr.Print(nf.Format(pcfgLB.GetEvalbF1Percent())); }
        pwErr.Print("\t");
        if (pq_1.GetDependencyParser() != null && depDA != null) { pwErr.Print(nf.Format(depDA.GetEvalbF1Percent())); }
        pwErr.Print("\t");
        if (pq_1.GetPCFGParser() != null && factTA != null) { pwErr.Print(nf.Format(factTA.GetEvalbF1Percent())); }
        pwErr.Print("\t");
        if (factLB != null) { pwErr.Print(factLB.GetNum()); }
        pwErr.Println();
    }
    // The returned score is the factored labeled-bracketing EVALB F1.
    double f1 = 0.0;
    if (factLB != null)
    {
        f1 = factLB.GetEvalbF1();
    }
    //Close files (if necessary)
    if (pwFileOut != null)
    {
        pwFileOut.Close();
    }
    if (pwStats != null)
    {
        pwStats.Close();
    }
    if (parserQueryEvals != null)
    {
        foreach (IParserQueryEval parserQueryEval in parserQueryEvals)
        {
            parserQueryEval.Display(false, pwErr);
        }
    }
    return (f1);
}
// Stdin driver: segments each input line with the max-match segmenter
// (greedy, maxwords, or the default heuristic) and prints the answers
// through the Sighan2005 writer.
public static void Main(string[] args)
{
    Properties props = StringUtils.ArgsToProperties(args);
    // logger.debug(props.toString());
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    string lexiconFile = props.GetProperty("lexicon");
    if (lexiconFile == null)
    {
        logger.Error("Error: no lexicon file!");
        System.Environment.Exit(1);
    }
    else
    {
        seg.AddLexicon(lexiconFile);
    }
    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.Init(flags);
    BufferedReader br = new BufferedReader(new InputStreamReader(Runtime.@in));
    PrintWriter stdoutW = new PrintWriter(System.Console.Out);
    int lineNb = 0;
    while (true)
    {
        ++lineNb;
        logger.Info("line: " + lineNb);
        try
        {
            string line = br.ReadLine();
            if (line == null)
            {
                break;
            }
            string outputLine;
            if (props.GetProperty("greedy") != null)
            {
                outputLine = SentenceUtils.ListToString(seg.GreedilySegmentWords(line));
            }
            else if (props.GetProperty("maxwords") != null)
            {
                seg.BuildSegmentationLattice(line);
                outputLine = SentenceUtils.ListToString(seg.SegmentWords(MaxMatchSegmenter.MatchHeuristic.Maxwords));
            }
            else
            {
                seg.BuildSegmentationLattice(line);
                outputLine = SentenceUtils.ListToString(seg.MaxMatchSegmentation());
            }
            // Feed the segmented line back through the Sighan reader so the
            // answers are printed in its expected format.
            IEnumerator<IList<CoreLabel>> itr = sighanRW.GetIterator(new StringReader(outputLine));
            while (itr.MoveNext())
            {
                sighanRW.PrintAnswers(itr.Current, stdoutW);
            }
        }
        catch (IOException)
        {
            // System.out.println(outputLine);
            break;
        }
    }
    stdoutW.Flush();
}
/// <summary>
/// POS-tags the tokens of one sentence in place. Sentences longer than
/// maxSentenceLength are skipped; if tagging fails (or was skipped), every
/// token receives the fallback tag "X".
/// </summary>
/// <param name="sentence">Sentence whose tokens are annotated with POS tags</param>
/// <returns>The same sentence object, with token tags set</returns>
private ICoreMap DoOneSentence(ICoreMap sentence)
{
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<TaggedWord> tagged = null;
    if (tokens.Count <= maxSentenceLength)
    {
        try
        {
            tagged = pos.TagSentence(tokens, this.reuseTags);
        }
        catch (OutOfMemoryException e)
        {
            log.Error(e);
            // Beware that we can now get an OOM in logging, too.
            log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
        }
    }
    if (tagged != null)
    {
        // Fix: the original loop bound referenced `sz`, which is not declared
        // anywhere in this method (a conversion artifact); bound the loop by
        // the token count, matching the tagged result one-to-one.
        for (int i = 0, sz = tokens.Count; i < sz; i++)
        {
            tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
        }
    }
    else
    {
        // Tagging failed or was skipped: mark every token with the unknown tag.
        foreach (CoreLabel token in tokens)
        {
            token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
        }
    }
    return (sentence);
}
// main method only
/// <summary>
/// Converts one or more tree files into a flat file of sentences, one per line,
/// optionally keeping tags (joined by -tagSeparator) and optionally removing spaces.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args) {
    string outputFilename = string.Empty;
    string tagSeparator = string.Empty;
    string treeRange = string.Empty;
    string inputEncoding = "UTF-8";
    string outputEncoding = "UTF-8";
    string treeFilter = string.Empty;
    bool noTags = false;
    bool noSpaces = false;
    IList<string> inputFilenames = new List<string>();
    // Flags may be given as -name or --name; any other argument is an input file.
    for (int i = 0; i < args.Length; ++i) {
        if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-output") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--output")) && (i + 1 < args.Length)) {
            outputFilename = args[i + 1];
            i++;
        } else if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tagSeparator") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--tagSeparator")) && (i + 1 < args.Length)) {
            tagSeparator = args[i + 1];
            i++;
        } else if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-treeRange") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--treeRange")) && (i + 1 < args.Length)) {
            treeRange = args[i + 1];
            i++;
        } else if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-inputEncoding") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--inputEncoding")) && (i + 1 < args.Length)) {
            inputEncoding = args[i + 1];
            i++;
        } else if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-outputEncoding") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--outputEncoding")) && (i + 1 < args.Length)) {
            outputEncoding = args[i + 1];
            i++;
        } else if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-treeFilter") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--treeFilter")) && (i + 1 < args.Length)) {
            treeFilter = args[i + 1];
            i++;
        } else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noTags") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--noTags")) {
            noTags = true;
        } else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noSpaces") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--noSpaces")) {
            noSpaces = true;
        } else {
            inputFilenames.Add(args[i]);
        }
    }
    if (outputFilename.Equals(string.Empty)) {
        log.Info("Must specify an output filename, -output");
        System.Environment.Exit(2);
    }
    if (inputFilenames.Count == 0) {
        log.Info("Must specify one or more input filenames");
        System.Environment.Exit(2);
    }
    FileOutputStream fos = new FileOutputStream(outputFilename);
    OutputStreamWriter osw = new OutputStreamWriter(fos, outputEncoding);
    BufferedWriter bout = new BufferedWriter(osw);
    try {
        Properties props = new Properties();
        foreach (string filename in inputFilenames) {
            // Build a TaggedFileRecord description string of the form
            // "key=value,...,filename" from the options for this input file.
            string description = TaggedFileRecord.Format + "=" + TaggedFileRecord.Format.Trees + "," + filename;
            if (!treeRange.IsEmpty()) {
                description = TaggedFileRecord.TreeRange + "=" + treeRange + "," + description;
            }
            if (!treeFilter.IsEmpty()) {
                description = TaggedFileRecord.TreeFilter + "=" + treeFilter + "," + description;
            }
            description = TaggedFileRecord.Encoding + "=" + inputEncoding + "," + description;
            TaggedFileRecord record = TaggedFileRecord.CreateRecord(props, description);
            foreach (IList<TaggedWord> sentence in record.Reader()) {
                string output = SentenceUtils.ListToString(sentence, noTags, tagSeparator);
                if (noSpaces) {
                    output = output.ReplaceAll(" ", string.Empty);
                }
                bout.Write(output);
                bout.NewLine();
            }
        }
        bout.Flush();
    } finally {
        // BUG FIX: the original only closed the streams on the success path,
        // leaking the output file handle if reading or writing threw.
        bout.Close();
        osw.Close();
        fos.Close();
    }
}
/// <summary>Run the scoring metric on guess/gold input.</summary>
/// <remarks>
/// Run the scoring metric on guess/gold input. This method performs "Collinization."
/// The default language is English.
/// </remarks>
/// <param name="args">option flags followed by the gold treebank path and the guess treebank path</param>
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    // Defaults: English parameters, no yield limits, quiet, segmentation (not tag) mode.
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = int.MaxValue;
    int maxGuessYield = int.MaxValue;
    bool Verbose = false;
    bool skipGuess = false;
    bool tagMode = false;
    string guessFile = null;
    string goldFile = null;
    for (int i = 0; i < args.Length; i++) {
        if (args[i].StartsWith("-")) {
            switch (args[i]) {
                case "-l": {
                    // Select language-specific treebank parameters.
                    Language lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }
                case "-y": {
                    // Maximum gold-tree yield length to evaluate.
                    maxGoldYield = System.Convert.ToInt32(args[++i].Trim());
                    break;
                }
                case "-t": {
                    tagMode = true;
                    break;
                }
                case "-v": {
                    Verbose = true;
                    break;
                }
                case "-g": {
                    // Maximum guess-tree yield length; longer guess trees are skipped.
                    maxGuessYield = System.Convert.ToInt32(args[++i].Trim());
                    skipGuess = true;
                    break;
                }
                default: {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        } else {
            // Required positional parameters: gold file, then guess file.
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    PrintWriter pwOut = tlpp.Pw();
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    string evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval eval = new Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval(evalName, tagMode);
    ITreeTransformer tc = tlpp.Collinizer();
    // PennTreeReader skips over null/malformed parses. So when the yields of the
    // gold/guess trees don't match, we need to keep looking for the next gold tree
    // that matches. The evalb reference implementation differs slightly as it
    // expects one tree per line. It assigns status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    foreach (Tree guess in guessTreebank) {
        Tree evalGuess = tc.TransformTree(guess);
        List <ILabel> guessSent = guess.Yield();
        // Character yield with all whitespace removed, for cross-segmentation comparison.
        string guessChars = SentenceUtils.ListToString(guessSent).ReplaceAll("\\s+", string.Empty);
        if (guessSent.Count > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        bool doneEval = false;
        // Advance through the gold treebank until a gold tree with a matching
        // character yield is found (or the yields disagree and this pair is skipped).
        while (goldItr.MoveNext() && !doneEval) {
            Tree gold = goldItr.Current;
            Tree evalGold = tc.TransformTree(gold);
            goldLineId++;
            List <ILabel> goldSent = gold.Yield();
            string goldChars = SentenceUtils.ListToString(goldSent).ReplaceAll("\\s+", string.Empty);
            if (goldSent.Count > maxGoldYield) {
                // Gold tree too long: skip it and keep scanning for a match.
                continue;
            } else {
                if (goldChars.Length != guessChars.Length) {
                    // Default evalb behavior -- skip this guess tree.
                    pwOut.Printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.Length, goldChars.Length);
                    skippedGuessTrees++;
                    break;
                }
            }
            eval.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
            // Evaluated this pair; move to the next guess parse.
            doneEval = true;
        }
    }
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0) {
        pwOut.Printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    }
    eval.Display(true, pwOut);
    pwOut.Println();
    pwOut.Close();
}
/// <summary>
/// Searches one or more treebank paths for trees whose yield matches a given
/// sentence, reporting the file and tree index of every match.
/// </summary>
public static void Main(string[] args) {
    // Args specified with -tagSeparator, -encoding, etc are assigned
    // to the appropriate option. Otherwise, the first arg found is
    // the sentence to look for, and all other args are paths in which
    // to look for that sentence.
    string needle = string.Empty;
    string tagSeparator = "_";
    string encoding = "utf-8";
    string fileRegex = string.Empty;
    IList <string> paths = new List <string>();
    for (int i = 0; i < args.Length; ++i) {
        if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tagSeparator") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--tagSeparator")) && i + 1 < args.Length) {
            tagSeparator = args[i + 1];
            ++i;
        } else {
            if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-encoding") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--encoding")) && i + 1 < args.Length) {
                encoding = args[i + 1];
                ++i;
            } else {
                if ((Sharpen.Runtime.EqualsIgnoreCase(args[i], "-fileRegex") || Sharpen.Runtime.EqualsIgnoreCase(args[i], "--fileRegex")) && i + 1 < args.Length) {
                    fileRegex = args[i + 1];
                    ++i;
                } else {
                    if (needle.Equals(string.Empty)) {
                        // First non-flag argument is the sentence to search for.
                        needle = args[i].Trim();
                    } else {
                        paths.Add(args[i]);
                    }
                }
            }
        }
    }
    ITreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    IFileFilter filter = null;
    if (!fileRegex.Equals(string.Empty)) {
        // NOTE(review): the original Java built an anonymous FileFilter around
        // filePattern here, but this port compiles the pattern and then assigns
        // filter = null, so filePattern is unused and -fileRegex is silently
        // ignored -- TODO restore the regex-based file filter.
        Pattern filePattern = Pattern.Compile(fileRegex);
        filter = null;
    }
    foreach (string path in paths) {
        // Start a new treebank with the given path, encoding, filter, etc
        DiskTreebank treebank = new DiskTreebank(trf, encoding);
        treebank.LoadPath(path, filter);
        IEnumerator <Tree> treeIterator = treebank.GetEnumerator();
        int treeCount = 0;
        string currentFile = string.Empty;
        while (treeIterator.MoveNext()) {
            // the treebank might be a directory, not a single file, so
            // keep track of which file we are currently looking at
            if (!currentFile.Equals(treebank.GetCurrentFilename())) {
                currentFile = treebank.GetCurrentFilename();
                treeCount = 0;
            }
            ++treeCount;
            Tree tree = treeIterator.Current;
            IList <TaggedWord> sentence = tree.TaggedYield();
            bool found = false;
            // The tree can match in one of three ways: tagged, untagged,
            // or untagged and unsegmented (which is useful for Chinese,
            // for example)
            string haystack = SentenceUtils.ListToString(sentence, true);
            found = needle.Equals(haystack);
            haystack = haystack.ReplaceAll(" ", string.Empty);
            found = found || needle.Equals(haystack);
            haystack = SentenceUtils.ListToString(sentence, false, tagSeparator);
            found = found || needle.Equals(haystack);
            if (found) {
                System.Console.Out.WriteLine("needle found in " + currentFile + " tree " + treeCount);
            }
        }
    }
}
/// <summary>
/// arg[0] := tokenizer options
/// args[1] := file to tokenize
/// Tokenizes each line of the input file twice -- once with the Arabic
/// tokenizer and once with the lexical mapper -- and reports any
/// token-level disagreement between the two to stderr.
/// </summary>
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length != 2) {
        System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
        System.Environment.Exit(-1);
    }
    string tokOptions = args[0];
    File path = new File(args[1]);
    log.Info("Reading from: " + path.GetPath());
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.Factory();
            tf.SetOptions(tokOptions);
            IMapper lexMapper = new DefaultLexicalMapper();
            lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
            int lineId = 0;
            for (string line; (line = br.ReadLine()) != null; lineId++) {
                line = line.Trim();
                // Tokenize with the tokenizer
                IList<CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                // Tokenize with the mapper
                StringBuilder sb = new StringBuilder();
                string[] toks = line.Split("\\s+");
                foreach (string tok in toks) {
                    string mappedTok = lexMapper.Map(null, tok);
                    sb.Append(mappedTok).Append(" ");
                }
                IList<string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                // Evaluate the output: first compare lengths, then token-by-token.
                if (mappedToks.Count != tokenizedLine.Count) {
                    System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                } else {
                    bool printLines = false;
                    for (int i = 0; i < mappedToks.Count; ++i) {
                        string mappedTok = mappedToks[i];
                        string tokenizedTok = tokenizedLine[i].Word();
                        if (!mappedTok.Equals(tokenizedTok)) {
                            System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                            printLines = true;
                        }
                    }
                    if (printLines) {
                        System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                }
            }
            System.Console.Error.Printf("Read %d lines.%n", lineId);
        } finally {
            // BUG FIX: the reader was never closed in the original;
            // release the file handle even if reading throws.
            br.Close();
        }
    } catch (UnsupportedEncodingException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    } catch (FileNotFoundException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    } catch (IOException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Parses one tokenized sentence under the given constraints, returning the
/// single best parse when kBest == 1 or up to kBest scored parses otherwise.
/// Parse failures and out-of-memory conditions are logged and produce an
/// empty list instead of propagating.
/// </summary>
/// <param name="constraints">Constraints to impose on the parser query.</param>
/// <param name="words">The tokenized sentence to parse.</param>
/// <returns>Zero or more parse trees, each with its score set.</returns>
private IList<Tree> DoOneSentence(IList<ParserConstraint> constraints, IList<CoreLabel> words) {
    IParserQuery query = parser.ParserQuery();
    query.SetConstraints(constraints);
    query.Parse(words);
    IList<Tree> results = Generics.NewLinkedList();
    try {
        if (this.kBest == 1) {
            // Single-best mode: use bestParse.
            Tree best = query.GetBestParse();
            if (best == null) {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            } else {
                // -10000 denotes unknown words
                best.SetScore(query.GetBestScore() % -10000.0);
                results.Add(best);
            }
        } else {
            IList<ScoredObject<Tree>> kBestParses = query.GetKBestParses(this.kBest);
            if (kBestParses == null || kBestParses.Count < 1) {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            } else {
                foreach (ScoredObject<Tree> scored in kBestParses) {
                    // -10000 denotes unknown words
                    Tree candidate = scored.Object();
                    candidate.SetScore(scored.Score() % -10000.0);
                    results.Add(candidate);
                }
            }
        }
    } catch (OutOfMemoryException e) {
        log.Error(e);
        // Beware that we can now get an OOM in logging, too.
        log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + "). " + "Will ignore and try to continue.");
    } catch (NoSuchParseException) {
        log.Warn("Parsing of sentence failed, possibly because of out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
    }
    return results;
}
/// <summary>TODO: clearly this should be a default method in ParserQuery once Java 8 comes out</summary>
/// <remarks>
/// Re-labels the leaves of <paramref name="tree"/> with the labels of the
/// original sentence, pairing leaves and original words positionally.
/// No-op when there is no original sentence or no tree.
/// </remarks>
/// <param name="tree">Parse tree whose leaf labels will be replaced in place.</param>
/// <exception cref="InvalidOperationException">If the leaf count differs from the original sentence length.</exception>
public virtual void RestoreOriginalWords(Tree tree) {
    if (originalSentence == null || tree == null) {
        return;
    }
    IList<Tree> leaves = tree.GetLeaves();
    if (leaves.Count != originalSentence.Count) {
        throw new InvalidOperationException("originalWords and sentence of different sizes: " + originalSentence.Count + " vs. " + leaves.Count + "\n Orig: " + SentenceUtils.ListToString(originalSentence) + "\n Pars: " + SentenceUtils.ListToString(leaves));
    }
    // TODO: get rid of this cast
    IEnumerator<ILabel> wordsIterator = (IEnumerator<ILabel>)originalSentence.GetEnumerator();
    foreach (Tree leaf in leaves) {
        // BUG FIX: the original read wordsIterator.Current without ever calling
        // MoveNext() (Java's iterator.next() advances AND returns; .NET's
        // Current does not advance and is undefined before the first MoveNext).
        // Advance first, then read.
        wordsIterator.MoveNext();
        leaf.SetLabel(wordsIterator.Current);
    }
}