/// <summary>
/// Demonstrates that a ColumnDataClassifier can be serialized to bytes in memory
/// and deserialized back, and that the restored classifier makes the same
/// predictions as the original on the cheese/disease test file.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private static void DemonstrateSerializationColumnDataClassifier()
{
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("Demonstrating working with a serialized classifier using serializeTo");
    ColumnDataClassifier cdc = new ColumnDataClassifier(where + "examples/cheese2007.prop");
    cdc.TrainClassifier(where + "examples/cheeseDisease.train");
    // Exhibit serialization and deserialization working. Serialized to bytes in memory for simplicity
    System.Console.Out.WriteLine();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ObjectOutputStream oos = new ObjectOutputStream(baos);
    cdc.SerializeClassifier(oos);
    oos.Close();
    byte[] @object = baos.ToByteArray();
    ByteArrayInputStream bais = new ByteArrayInputStream(@object);
    ObjectInputStream ois = new ObjectInputStream(bais);
    ColumnDataClassifier cdc2 = ColumnDataClassifier.GetClassifier(ois);
    ois.Close();
    // We compare the output of the deserialized classifier cdc2 versus the original one
    // For both we use a ColumnDataClassifier to convert text lines to examples
    System.Console.Out.WriteLine("Making predictions with both classifiers");
    foreach (string line in ObjectBank.GetLineIterator(where + "examples/cheeseDisease.test", "utf-8"))
    {
        IDatum<string, string> d = cdc.MakeDatumFromLine(line);
        IDatum<string, string> d2 = cdc2.MakeDatumFromLine(line);
        System.Console.Out.Printf("%s =origi=> %s (%.4f)%n", line, cdc.ClassOf(d), cdc.ScoresOf(d).GetCount(cdc.ClassOf(d)));
        // Bug fix: the deserialized classifier must be scored on its own datum d2
        // (the original line scored cdc2 on d, inconsistent with the printed class).
        System.Console.Out.Printf("%s =deser=> %s (%.4f)%n", line, cdc2.ClassOf(d2), cdc2.ScoresOf(d2).GetCount(cdc2.ClassOf(d2)));
    }
}
/// <summary>
/// Optionally augments the supervised training documents with unlabeled
/// ("unsupervised dropout") documents read from <c>flags.unsupDropoutFile</c>.
/// Every token of the unsupervised documents is labeled with the background
/// symbol so it acts as an unlabeled example.
/// </summary>
/// <param name="docs">The supervised training documents.</param>
/// <param name="readerAndWriter">Reader used to parse the unsupervised dropout file.</param>
/// <returns>
/// The combined supervised + unsupervised document list when feature discovery
/// is enabled and unsupervised documents are available; otherwise <paramref name="docs"/> unchanged.
/// </returns>
protected internal override ICollection<IList<IN>> LoadAuxiliaryData(ICollection<IList<IN>> docs, IDocumentReaderAndWriter<IN> readerAndWriter)
{
    if (flags.unsupDropoutFile != null)
    {
        log.Info("Reading unsupervised dropout data from file: " + flags.unsupDropoutFile);
        Timing timer = new Timing();
        timer.Start();
        unsupDocs = new List<IList<IN>>();
        ObjectBank<IList<IN>> unsupObjBank = MakeObjectBankFromFile(flags.unsupDropoutFile, readerAndWriter);
        foreach (IList<IN> doc in unsupObjBank)
        {
            foreach (IN tok in doc)
            {
                // Unsupervised tokens carry no gold labels; set both answer and
                // gold answer to the background symbol so they count as unlabeled.
                tok.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);
                tok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), flags.backgroundSymbol);
            }
            unsupDocs.Add(doc);
        }
        long elapsedMs = timer.Stop();
        log.Info("Time to read: : " + Timing.ToSecondsString(elapsedMs) + " seconds");
    }
    if (unsupDocs != null && flags.doFeatureDiscovery)
    {
        // Feature discovery trains on the concatenation of both document sets.
        IList<IList<IN>> totalDocs = new List<IList<IN>>();
        Sharpen.Collections.AddAll(totalDocs, docs);
        Sharpen.Collections.AddAll(totalDocs, unsupDocs);
        return (totalDocs);
    }
    else
    {
        return (docs);
    }
}
/// <summary>
/// Entry point: trains a ColumnDataClassifier on the cheese/disease example data,
/// prints a prediction for every test line plus overall accuracy/macro-F1, and
/// then runs the two serialization demos. An optional first argument overrides
/// the base directory of the example files.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    if (args.Length > 0)
    {
        where = args[0] + File.separator;
    }
    System.Console.Out.WriteLine("Training ColumnDataClassifier");
    ColumnDataClassifier classifier = new ColumnDataClassifier(where + "examples/cheese2007.prop");
    classifier.TrainClassifier(where + "examples/cheeseDisease.train");
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("Testing predictions of ColumnDataClassifier");
    foreach (string testLine in ObjectBank.GetLineIterator(where + "examples/cheeseDisease.test", "utf-8"))
    {
        // If you already hold the individual columns, use
        // cdc.makeDatumFromStrings(String[]) instead of parsing the line.
        IDatum<string, string> datum = classifier.MakeDatumFromLine(testLine);
        string predicted = classifier.ClassOf(datum);
        System.Console.Out.Printf("%s ==> %s (%.4f)%n", testLine, predicted, classifier.ScoresOf(datum).GetCount(predicted));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("Testing accuracy of ColumnDataClassifier");
    Pair<double, double> performance = classifier.TestClassifier(where + "examples/cheeseDisease.test");
    System.Console.Out.Printf("Accuracy: %.3f; macro-F1: %.3f%n", performance.First(), performance.Second());
    DemonstrateSerialization();
    DemonstrateSerializationColumnDataClassifier();
}
// end class CoNLLIterator
/// <summary>
/// Splits the reader's contents into documents, starting a new document at each
/// line whose beginning matches <c>docPattern</c>, and returns an iterator over
/// the accumulated document strings (each line re-joined with '\n').
/// </summary>
private static IEnumerator<string> SplitIntoDocs(Reader r)
{
    ICollection<string> docs = new List<string>();
    StringBuilder buffer = new StringBuilder();
    Matcher docStart = docPattern.Matcher(string.Empty);
    foreach (string line in ObjectBank.GetLineIterator(r))
    {
        // A matching line starts a new document: flush the previous one if any.
        if (docStart.Reset(line).LookingAt() && buffer.Length > 0)
        {
            docs.Add(buffer.ToString());
            buffer.Length = 0;
        }
        buffer.Append(line).Append('\n');
    }
    if (buffer.Length > 0)
    {
        docs.Add(buffer.ToString());
    }
    return docs.GetEnumerator();
}
/// <summary>This runs a simple train and test regime.</summary>
/// <remarks>
/// This runs a simple train and test regime.
/// The data file format is one item per line, space separated, with first the class label
/// and then a bunch of (categorical) string features.
/// </remarks>
/// <param name="args">The arguments/flags are: -trainFile trainFile -testFile testFile [-l1reg num] [-biased]</param>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    Properties prop = StringUtils.ArgsToProperties(args);
    // L1 regularization strength; "0.0" (the default) means no L1 regularization.
    double l1reg = double.ParseDouble(prop.GetProperty("l1reg", "0.0"));
    Dataset<string, string> ds = new Dataset<string, string>();
    // Each training line: <label> <feature> <feature> ...
    foreach (string line in ObjectBank.GetLineIterator(new File(prop.GetProperty("trainFile"))))
    {
        string[] bits = line.Split("\\s+");
        // Features are everything after the first token (the label).
        ICollection<string> f = new LinkedList<string>(Arrays.AsList(bits).SubList(1, bits.Length));
        string l = bits[0];
        ds.Add(f, l);
    }
    ds.SummaryStatistics();
    bool biased = prop.GetProperty("biased", "false").Equals("true");
    LogisticClassifierFactory<string, string> factory = new LogisticClassifierFactory<string, string>();
    Edu.Stanford.Nlp.Classify.LogisticClassifier<string, string> lc = factory.TrainClassifier(ds, l1reg, 1e-4, biased);
    // Print probability, predicted class, and the original line for each test item.
    foreach (string line_1 in ObjectBank.GetLineIterator(new File(prop.GetProperty("testFile"))))
    {
        string[] bits = line_1.Split("\\s+");
        ICollection<string> f = new LinkedList<string>(Arrays.AsList(bits).SubList(1, bits.Length));
        //String l = bits[0];
        string g = lc.ClassOf(f);
        double prob = lc.ProbabilityOf(f, g);
        System.Console.Out.Printf("%4.3f\t%s\t%s%n", prob, g, line_1);
    }
}
/// <summary>
/// Reads a two-column whitespace-separated file mapping words to their
/// mixed-case forms and returns it as a dictionary (first column -> second column).
/// </summary>
private static IDictionary<string, string> LoadMixedCaseMap(string mapFile)
{
    IDictionary<string, string> map = Generics.NewHashMap();
    try
    {
        using (BufferedReader br = IOUtils.ReaderFromString(mapFile))
        {
            foreach (string rawLine in ObjectBank.GetLineIterator(br))
            {
                string trimmed = rawLine.Trim();
                string[] parts = trimmed.Split("\\s+");
                // Every line must have exactly two fields.
                if (parts.Length != 2)
                {
                    throw new Exception("Wrong format: " + mapFile);
                }
                map[parts[0]] = parts[1];
            }
        }
    }
    catch (IOException e)
    {
        // Surface I/O problems as an unchecked exception, preserving the contract.
        throw new RuntimeIOException(e);
    }
    return map;
}
/// <summary>Constructs a processor, optionally loading a character-normalization table.</summary>
/// <param name="normalizationTableFile">
/// A file listing character pairs for
/// normalization. Currently the normalization table must be in UTF-8.
/// If this parameter is
/// <see langword="null"/>
/// , the default normalization
/// of the zero-argument constructor is used.
/// </param>
public ChineseDocumentToSentenceProcessor(string normalizationTableFile)
{
    // todo: This class is a mess. We should try to get it out of core
    // not \uff0e . (too often separates English first/last name, etc.)
    // private final String normalizationTableFile;
    // this.normalizationTableFile = normalizationTableFile;
    if (normalizationTableFile != null)
    {
        normalizationTable = new List<Pair<string, string>>();
        // Each table line is expected to contain one (from, to) pair matched by PairPattern.
        foreach (string line in ObjectBank.GetLineIterator(new File(normalizationTableFile), encoding))
        {
            Matcher pairMatcher = PairPattern.Matcher(line);
            if (pairMatcher.Find())
            {
                normalizationTable.Add(new Pair<string, string>(pairMatcher.Group(1), pairMatcher.Group(2)));
            }
            else
            {
                // Malformed lines are logged and skipped rather than aborting the load.
                log.Info("Didn't match: " + line);
            }
        }
    }
    else
    {
        // No table: leave null so the default normalization is used.
        normalizationTable = null;
    }
}
/// <summary>
/// Wraps an existing ObjectBank so its documents can be post-processed
/// according to the given flags and set of known lowercase words.
/// </summary>
/// <param name="flags">Configuration controlling the wrapping behavior.</param>
/// <param name="wrapped">The underlying ObjectBank being wrapped.</param>
/// <param name="knownLCWords">Collection of known lowercase words shared with the caller.</param>
public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<IList<In>> wrapped, ICollection<string> knownLCWords)
    : base(null, null)
{
    // base(null, null): presumably the base machinery is bypassed in favor of
    // the wrapped bank — TODO confirm against the base class.
    this.knownLCWords = knownLCWords;
    this.wrapped = wrapped;
    this.flags = flags;
}
// static demo class
/// <summary>
/// Command-line demo: compiles each line of a TokensRegex rules file into a
/// TokenSequencePattern, annotates the given text file with a CoreNLP pipeline,
/// and prints every sentence's tokens plus all non-overlapping pattern matches.
/// Usage: rulesFile inputFile [outFile]; output goes to stdout when no outFile is given.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
        return;
    }
    string rules = args[0];
    PrintWriter @out;
    if (args.Length > 2)
    {
        @out = new PrintWriter(args[2]);
    }
    else
    {
        @out = new PrintWriter(System.Console.Out);
    }
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
    pipeline.Annotate(annotation);
    // Load lines of file as TokenSequencePatterns
    IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
    foreach (string line in ObjectBank.GetLineIterator(rules))
    {
        TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
        tokenSequencePatterns.Add(pattern);
    }
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int i = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        @out.Println("Sentence #" + ++i);
        @out.Print(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            @out.Print(' ');
            @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        @out.Println();
        // NOTE(review): the multi-pattern matcher is rebuilt for every sentence;
        // it could be hoisted out of the loop, though behavior is the same.
        MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
        IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
        int j = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in answers)
        {
            @out.Println(" Match #" + ++j);
            // Group 0 is the whole match; groups 1..GroupCount are capture groups.
            for (int k = 0; k <= matched.GroupCount(); k++)
            {
                @out.Println("  group " + k + " = " + matched.Group(k));
            }
        }
    }
    @out.Flush();
}
// Unity lifecycle hook: runs once before the first frame update.
void Start()
{
    // Empty container objects used to group spawned scene elements by kind.
    trees = new GameObject("Trees");
    arbors = new GameObject("Arbors");
    banks = new GameObject("Banks");
    // Cache the sibling maze constructor and the scene-wide object bank.
    generator = GetComponent<MazeConstructor>();
    objectBank = FindObjectOfType<ObjectBank>();
    StartNewGame();
}
/// <summary>Train a segmenter from raw text.</summary>
/// <remarks>Train a segmenter from raw text. Gold segmentation markers are required.</remarks>
public virtual void Train()
{
    // Training data always carries segmentation markers and morphological tags.
    IDocumentReaderAndWriter<CoreLabel> reader = new ArabicDocumentReaderAndWriter(true, true, hasDomainLabels, domain, noRewrites, tf);
    ObjectBank<IList<CoreLabel>> trainingLines = classifier.MakeObjectBankFromFile(flags.trainFile, reader);
    classifier.Train(trainingLines, reader);
    log.Info("Finished training.");
}
/// <summary>
/// The Extractor argument extraction keeps ; together, so we use
/// that to delimit options.
/// </summary>
/// <remarks>
/// The Extractor argument extraction keeps ; together, so we use
/// that to delimit options. Actually, the only option supported is
/// mapdigits, which tells the Distsim to try mapping [0-9] to 0 and
/// requery for an unknown word with digits.
/// </remarks>
/// <param name="path">Lexicon specification: filename[;option[;option...]].</param>
public Distsim(string path)
{
    // Avoid loading the same lexicon twice but allow different lexicons
    // TODO: when loading a distsim, should we populate this map?
    // = false
    // = false;
    // Split off the option list; the first piece is always the lexicon filename.
    string[] pieces = path.Split(";");
    string filename = pieces[0];
    for (int arg = 1; arg < pieces.Length; ++arg)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(pieces[arg], "mapdigits"))
        {
            mapdigits = true;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(pieces[arg], "casedDistSim"))
            {
                casedDistSim = true;
            }
            else
            {
                throw new ArgumentException("Unknown argument " + pieces[arg]);
            }
        }
    }
    lexicon = Generics.NewHashMap();
    // todo [cdm 2016]: Note that this loads file with default file encoding rather than specifying it
    foreach (string word in ObjectBank.GetLineIterator(new File(filename)))
    {
        // Each lexicon line: <word> <cluster>; words are lowercased unless casedDistSim is set.
        string[] bits = word.Split("\\s+");
        string w = bits[0];
        if (!casedDistSim)
        {
            w = w.ToLower();
        }
        lexicon[w] = bits[1];
    }
    // Use the lexicon's <unk> entry as the fallback class if present, else the literal "null".
    if (lexicon.Contains("<unk>"))
    {
        unk = lexicon["<unk>"];
    }
    else
    {
        unk = "null";
    }
}
/// <summary>
/// Loads an SRL file into <c>srlMap</c>. Each line is "filename treeNumber info"
/// (info may contain whitespace); entries are grouped per filename into a
/// CollectionValuedMap keyed by tree number.
/// </summary>
private void ReadSRLFile(string srlFile)
{
    srlMap = Generics.NewHashMap();
    foreach (string entry in ObjectBank.GetLineIterator(new File(srlFile)))
    {
        // Limit to 3 fields so the info portion keeps its internal whitespace.
        string[] fields = entry.Split("\\s+", 3);
        string file = fields[0];
        int treeIndex = System.Convert.ToInt32(fields[1]);
        string payload = fields[2];
        CollectionValuedMap<int, string> valueMap = srlMap[file];
        if (valueMap == null)
        {
            // First entry for this file: create its map lazily.
            valueMap = new CollectionValuedMap<int, string>();
            srlMap[file] = valueMap;
        }
        valueMap.Add(treeIndex, payload);
    }
}
/// <summary>Read the data as a list of RVFDatum objects.</summary>
/// <remarks>Read the data as a list of RVFDatum objects. For the test set we must reuse the indices from the training set</remarks>
/// <returns>The parsed examples, or null if reading fails.</returns>
internal static List<RVFDatum<string, int>> ReadData(string filename, IDictionary<int, IIndex<string>> indices)
{
    try
    {
        string sep = ", ";
        List<RVFDatum<string, int>> examples = new List<RVFDatum<string, int>>();
        foreach (string row in ObjectBank.GetLineIterator(new File(filename)))
        {
            examples.Add(ReadDatum(row, sep, indices));
        }
        return examples;
    }
    catch (Exception e)
    {
        // Best-effort: log the failure and fall through to the null return below.
        Sharpen.Runtime.PrintStackTrace(e);
    }
    return null;
}
/// <summary>
/// Checks that iterating an ObjectBankWrapper over the test document yields the
/// expected token texts and that each token receives the expected chris2 word shape.
/// </summary>
public virtual void TestUsingIterator()
{
    // Input: two "@@"-headed segments; expected tokens and their word shapes in order.
    string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";
    string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
    string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
    NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
    Properties props = PropertiesUtils.AsProperties("wordShape", "chris2");
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    PlainTextDocumentReaderAndWriter<CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter<CoreLabel>();
    readerAndWriter.Init(flags);
    ReaderIteratorFactory rif = new ReaderIteratorFactory(new StringReader(s));
    ObjectBank<IList<CoreLabel>> di = new ObjectBank<IList<CoreLabel>>(rif, readerAndWriter);
    ICollection<string> knownLCWords = new HashSet<string>();
    ObjectBankWrapper<CoreLabel> obw = new ObjectBankWrapper<CoreLabel>(flags, di, knownLCWords);
    try
    {
        int outIdx = 0;
        // Walk every token of every sentence, comparing text and shape in order.
        for (IEnumerator<IList<CoreLabel>> iter = obw.GetEnumerator(); iter.MoveNext();)
        {
            IList<CoreLabel> sent = iter.Current;
            for (IEnumerator<CoreLabel> iter2 = sent.GetEnumerator(); iter2.MoveNext();)
            {
                CoreLabel cl = iter2.Current;
                string tok = cl.Word();
                string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                outIdx++;
            }
        }
        if (outIdx < output.Length)
        {
            NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
        }
    }
    catch (Exception e)
    {
        // Producing more tokens than expected would overrun the arrays and land here.
        NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
    }
}
/// <summary>
/// Loads a distributional-similarity lexicon mapping words to word classes.
/// Two line formats are supported: "terryKoo" (class TAB word ...) and the
/// default "alexClark" (word class).
/// </summary>
/// <param name="filename">Path of the lexicon file.</param>
/// <param name="format">"terryKoo" selects the tab-separated class-first format; anything else means alexClark.</param>
/// <param name="encoding">Character encoding used to read the file.</param>
/// <param name="distSimMaxBits">If &gt; 0, terryKoo classes longer than this are truncated to this length.</param>
/// <param name="cased">If false, words are lowercased before insertion.</param>
/// <param name="numberEquivalence">If true, digits are collapsed via word-shape so numbers share entries.</param>
/// <param name="unknownWordClass">Class to use for words not found in the lexicon.</param>
public DistSimClassifier(string filename, string format, string encoding, int distSimMaxBits, bool cased, bool numberEquivalence, string unknownWordClass)
{
    this.cased = cased;
    this.numberEquivalence = numberEquivalence;
    this.unknownWordClass = unknownWordClass;
    Timing.StartDoing("Loading distsim lexicon from " + filename);
    lexicon = Generics.NewHashMap(1 << 15);
    // make a reasonable starting size
    bool terryKoo = "terryKoo".Equals(format);
    foreach (string line in ObjectBank.GetLineIterator(filename, encoding))
    {
        string word;
        string wordClass;
        if (terryKoo)
        {
            // terryKoo format: <class>\t<word>; optionally truncate class to distSimMaxBits chars.
            string[] bits = line.Split("\\t");
            word = bits[1];
            wordClass = bits[0];
            if (distSimMaxBits > 0 && wordClass.Length > distSimMaxBits)
            {
                wordClass = Sharpen.Runtime.Substring(wordClass, 0, distSimMaxBits);
            }
        }
        else
        {
            // "alexClark"
            // alexClark format: <word> <class>
            string[] bits = line.Split("\\s+");
            word = bits[0];
            wordClass = bits[1];
        }
        if (!cased)
        {
            word = word.ToLower();
        }
        if (numberEquivalence)
        {
            // Collapse digits so e.g. "1997" and "2008" map to the same shaped key.
            word = WordShapeClassifier.WordShape(word, WordShapeClassifier.Wordshapedigits);
        }
        lexicon[word] = wordClass;
    }
    Timing.EndDoing();
}
/// <summary>Constructs a Dataset by reading in a file in SVM light format.</summary>
/// <remarks>
/// Constructs a Dataset by reading in a file in SVM light format.
/// the created dataset has the same feature and label index as given
/// </remarks>
/// <param name="filename">File containing one SVM-light datum per line.</param>
/// <param name="featureIndex">Feature index to reuse for the new dataset.</param>
/// <param name="labelIndex">Label index to reuse for the new dataset.</param>
/// <param name="lines">If non-null, each raw input line is also appended here.</param>
public static Edu.Stanford.Nlp.Classify.Dataset<string, string> ReadSVMLightFormat(string filename, IIndex<string> featureIndex, IIndex<string> labelIndex, IList<string> lines)
{
    Edu.Stanford.Nlp.Classify.Dataset<string, string> dataset;
    try
    {
        dataset = new Edu.Stanford.Nlp.Classify.Dataset<string, string>(10, featureIndex, labelIndex);
        foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
        {
            if (lines != null)
            {
                lines.Add(line);
            }
            dataset.Add(SvmLightLineToDatum(line));
        }
    }
    catch (Exception e)
    {
        // Bug fix: System.Exception has no (Exception) constructor; wrap the
        // cause as the inner exception, preserving the original message.
        throw new Exception(e.Message, e);
    }
    return dataset;
}
/// <summary>reads scores with classes from a file, sorts by score and creates the arrays</summary>
/// <param name="filename">File with one "score class" pair per line.</param>
public PRCurve(string filename)
{
    //sorted scores
    // the class of example i
    // the guess of example i according to the argmax
    // number positive in the i-th highest scores
    // number negative in the i-th lowest scores
    try
    {
        List<Pair<double, int>> dataScores = new List<Pair<double, int>>();
        foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
        {
            IList<string> elems = StringUtils.Split(line);
            // Bug fix: C# double has no ValueOf method; use double.Parse instead.
            Pair<double, int> p = new Pair<double, int>(double.Parse(elems[0]), int.Parse(elems[1]));
            dataScores.Add(p);
        }
        Init(dataScores);
    }
    catch (Exception e)
    {
        // Best-effort constructor: on failure the curve is left uninitialized.
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>reads scores with classes from a file, sorts by score and creates the arrays</summary>
/// <param name="filename">File with one "class score" pair per line.</param>
/// <param name="svm">Marker overload for SVM-style input: classes are -1/1 and scores are shifted by 0.5.</param>
public PRCurve(string filename, bool svm)
{
    try
    {
        List<Pair<double, int>> dataScores = new List<Pair<double, int>>();
        foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
        {
            IList<string> elems = StringUtils.Split(line);
            // Bug fix: double.ValueOf does not exist in C# and a double cannot be
            // implicitly assigned to int; parse and truncate to the integer class.
            int cls = (int)double.Parse(elems[0]);
            if (cls == -1)
            {
                // SVM convention uses -1 for the negative class; normalize to 0.
                cls = 0;
            }
            double score = double.Parse(elems[1]) + 0.5;
            // Bug fix: cls is already an int, so the original int.Parse(cls) was ill-typed.
            Pair<double, int> p = new Pair<double, int>(score, cls);
            dataScores.Add(p);
        }
        Init(dataScores);
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses.
/// </summary>
/// <remarks>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses. In other words, the evaluation file has the
/// same format as the training data.
/// </remarks>
/// <param name="pwOut">Destination for the evaluation report.</param>
private void Evaluate(PrintWriter pwOut)
{
    log.Info("Starting evaluation...");
    bool hasSegmentationMarkers = true;
    bool hasTags = true;
    IDocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
    ObjectBank<IList<CoreLabel>> lines = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
    // Optional TEDEval output files; only created when a prefix was configured.
    PrintWriter tedEvalGoldTree = null;
    PrintWriter tedEvalParseTree = null;
    PrintWriter tedEvalGoldSeg = null;
    PrintWriter tedEvalParseSeg = null;
    if (tedEvalPrefix != null)
    {
        try
        {
            tedEvalGoldTree = new PrintWriter(tedEvalPrefix + "_gold.ftree");
            tedEvalGoldSeg = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
            tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
            tedEvalParseSeg = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
        }
        catch (FileNotFoundException e)
        {
            // TEDEval output is best-effort: report the problem and continue without it.
            System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
        }
    }
    // Overall and per-reference-label counts of correctly labeled characters.
    ICounter<string> labelTotal = new ClassicCounter<string>();
    ICounter<string> labelCorrect = new ClassicCounter<string>();
    int total = 0;
    int correct = 0;
    foreach (IList<CoreLabel> line in lines)
    {
        // Snapshot the raw input and gold segmentation before classification replaces `line`.
        string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
        string[] goldTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        line = classifier.Classify(line);
        string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        foreach (CoreLabel label in line)
        {
            // Do not evaluate labeling of whitespace
            string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
            if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
            {
                total++;
                string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                string reference = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                labelTotal.IncrementCount(reference);
                if (hypothesis.Equals(reference))
                {
                    correct++;
                    labelCorrect.IncrementCount(reference);
                }
            }
        }
        if (tedEvalParseSeg != null)
        {
            // Emit one flat (root (seg ...) ...) tree per sentence for TEDEval.
            tedEvalGoldTree.Printf("(root");
            tedEvalParseTree.Printf("(root");
            int safeLength = inputTokens.Length;
            if (inputTokens.Length != goldTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" gold: " + Arrays.ToString(goldTokens));
                safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
            }
            if (inputTokens.Length != parseTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" parse: " + Arrays.ToString(parseTokens));
                safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
            }
            for (int i = 0; i < safeLength; i++)
            {
                // Gold and parse tokens use ':' to separate segments within a token.
                foreach (string segment in goldTokens[i].Split(":"))
                {
                    tedEvalGoldTree.Printf(" (seg %s)", segment);
                }
                tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                foreach (string segment_1 in parseTokens[i].Split(":"))
                {
                    tedEvalParseTree.Printf(" (seg %s)", segment_1);
                }
                tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
            }
            tedEvalGoldTree.Printf(")%n");
            tedEvalGoldSeg.Println();
            tedEvalParseTree.Printf(")%n");
            tedEvalParseSeg.Println();
        }
    }
    double accuracy = ((double)correct) / ((double)total);
    accuracy *= 100.0;
    pwOut.Println("EVALUATION RESULTS");
    pwOut.Printf("#datums:\t%d%n", total);
    pwOut.Printf("#correct:\t%d%n", correct);
    pwOut.Printf("accuracy:\t%.2f%n", accuracy);
    pwOut.Println("==================");
    // Output the per label accuracies
    pwOut.Println("PER LABEL ACCURACIES");
    foreach (string refLabel in labelTotal.KeySet())
    {
        double nTotal = labelTotal.GetCount(refLabel);
        double nCorrect = labelCorrect.GetCount(refLabel);
        double acc = (nCorrect / nTotal) * 100.0;
        pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
    }
    if (tedEvalParseSeg != null)
    {
        tedEvalGoldTree.Close();
        tedEvalGoldSeg.Close();
        tedEvalParseTree.Close();
        tedEvalParseSeg.Close();
    }
}
/// <summary>
/// Parses the CoNLL-U text of one sentence into a SemanticGraph, handling
/// multiword token spans and enhanced ("extra") dependencies.
/// </summary>
/// <param name="line">The CoNLL-U lines of a single sentence; may be null.</param>
/// <returns>The resulting SemanticGraph, or null when <paramref name="line"/> is null.</returns>
public virtual SemanticGraph Apply(string line)
{
    if (line == null)
    {
        return (null);
    }
    IFunction<string, IndexedWord> func = new CoNLLUDocumentReader.WordProcessor();
    ObjectBank<IndexedWord> words = ObjectBank.GetLineIterator(new StringReader(line), func);
    IList<IndexedWord> wordList = new List<IndexedWord>(words);
    IList<IndexedWord> sorted = new List<IndexedWord>(wordList.Count);
    IList<string> comments = new LinkedList<string>();
    /* Increase the line number in case there are comments before the actual sentence
     * and add them to the list of comments. */
    // NOTE(review): the null arguments below are Sharpen translation artifacts where
    // the original Java lambdas were dropped; as written these stream pipelines
    // perform no visible work — confirm against the Java source.
    wordList.Stream().Filter(null).ForEach(null);
    wordList.Stream().Filter(null).Sorted(byIndex.ThenComparing(byType)).ForEach(null);
    IList<IndexedWord> sortedTokens = new List<IndexedWord>(wordList.Count);
    sorted.Stream().Filter(null).Filter(null).ForEach(null);
    sorted.Stream().Filter(null).Filter(null).ForEach(null);
    /* Construct a semantic graph. */
    IList<TypedDependency> deps = new List<TypedDependency>(sorted.Count);
    IntPair tokenSpan = null;
    string originalToken = null;
    foreach (IndexedWord word in sorted)
    {
        lineNumberCounter++;
        if (word.ContainsKey(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation)))
        {
            // Multiword token header line: remember its span and surface form for
            // the individual words that follow.
            tokenSpan = word.Get(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation));
            originalToken = word.Word();
        }
        else
        {
            /* Deal with multiword tokens. */
            if (tokenSpan != null && tokenSpan.GetTarget() >= word.Index())
            {
                // Word lies inside the current multiword span: attach the span info.
                word.SetOriginalText(originalToken);
                word.Set(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation), tokenSpan);
            }
            else
            {
                tokenSpan = null;
                originalToken = null;
            }
            Dictionary<string, string> extraDeps = word.Get(typeof(CoreAnnotations.CoNLLUSecondaryDepsAnnotation));
            if (extraDeps.IsEmpty())
            {
                // No enhanced dependencies: use the basic governor/relation columns.
                int govIdx = word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation));
                Pair<IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(govIdx, 0, word, word.Get(typeof(CoreAnnotations.CoNLLDepTypeAnnotation)), sortedTokens);
                IndexedWord gov = govReln.First();
                GrammaticalRelation reln = govReln.Second();
                TypedDependency dep = new TypedDependency(reln, gov, word);
                word.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumberCounter);
                deps.Add(dep);
            }
            else
            {
                foreach (string extraGovIdxStr in extraDeps.Keys)
                {
                    if (extraGovIdxStr.Contains("."))
                    {
                        // Governor index of the form "8.1": base index plus empty-node copy count.
                        string[] indexParts = extraGovIdxStr.Split("\\.");
                        int extraGovIdx = System.Convert.ToInt32(indexParts[0]);
                        int copyCount = System.Convert.ToInt32(indexParts[1]);
                        Pair<IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(extraGovIdx, copyCount, word, extraDeps[extraGovIdxStr], sortedTokens);
                        IndexedWord gov = govReln.First();
                        GrammaticalRelation reln = govReln.Second();
                        TypedDependency dep = new TypedDependency(reln, gov, word);
                        dep.SetExtra();
                        deps.Add(dep);
                    }
                    else
                    {
                        int extraGovIdx = System.Convert.ToInt32(extraGovIdxStr);
                        // -1 when the word has no basic governor at all.
                        int mainGovIdx = word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation)) != null ? word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation)) : -1;
                        Pair<IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(extraGovIdx, 0, word, extraDeps[extraGovIdxStr], sortedTokens);
                        IndexedWord gov = govReln.First();
                        GrammaticalRelation reln = govReln.Second();
                        TypedDependency dep = new TypedDependency(reln, gov, word);
                        // Only mark as "extra" when it differs from the basic dependency.
                        if (extraGovIdx != mainGovIdx)
                        {
                            dep.SetExtra();
                        }
                        deps.Add(dep);
                    }
                }
            }
        }
    }
    lineNumberCounter++;
    SemanticGraph sg = new SemanticGraph(deps);
    comments.ForEach(null);
    return (sg);
}