/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public NameSample Read() {
    AdSentence paragraph;

    if (monitor != null)
        monitor.Token.ThrowIfCancellationRequested();

    while ((paragraph = adSentenceStream.Read()) != null) {
        var clearData = false;

        // clear the adaptive data whenever a new text begins
        var currentTextID = GetTextId(paragraph);
        if (currentTextID != textId) {
            clearData = true;
            textId = currentTextID;
        }

        var root = paragraph.Root;
        var sentence = new List<string>();
        var names = new List<Span>();

        Process(root, sentence, names);

        return new NameSample(sentence.ToArray(), names.ToArray(), clearData);
    }
    return null;
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public Sequence Read() {
    var sample = psi.Read();
    if (sample != null) {
        var events = new Event[sample.Sentence.Length];

        // the encoded tags depend only on the sample, not on the token
        // index, so they can be computed once for the whole sentence
        var tags = seqCodec.Encode(sample.Names, sample.Sentence.Length);

        for (int i = 0; i < sample.Sentence.Length; i++) {
            // it is safe to pass the tags as previous tags because
            // the context generator does not look for non predicted tags
            var context = pcg.GetContext(
                i,
                sample.Sentence,
                useOutcomes ? tags : null,
                null);

            events[i] = new Event(tags[i], context);
        }
        return new Sequence(events, sample);
    }
    return null;
}
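// Illustrative sketch, not part of the library: conceptually, the sequence
// codec used above (seqCodec.Encode) maps name spans to one outcome tag per
// token, typically with a BIO-style scheme. A minimal encoder under that
// assumption; the NameSpan type and all names below are hypothetical.
public static class BioEncodeSketch {
    public struct NameSpan {
        public int Start;     // first token of the name
        public int End;       // one past the last token
        public string Type;   // e.g. "person"
    }

    public static string[] Encode(NameSpan[] names, int length) {
        var tags = new string[length];
        for (var i = 0; i < length; i++)
            tags[i] = "O";                       // outside any name by default

        foreach (var name in names) {
            tags[name.Start] = "B-" + name.Type; // begin of a name
            for (var i = name.Start + 1; i < name.End; i++)
                tags[i] = "I-" + name.Type;      // inside of a name
        }
        return tags;
    }
}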
private List<ComparableEvent> Index(
    IObjectStream<Event> indexEventStream,
    Dictionary<string, int> predicateIndex) {

    var map = new Dictionary<string, int>();
    var indexedContext = new List<int>();
    var eventsToCompare = new List<ComparableEvent>();
    int outcomeCount = 0;

    Event ev;
    while ((ev = indexEventStream.Read()) != null) {
        if (Monitor != null && Monitor.Token.CanBeCanceled)
            Monitor.Token.ThrowIfCancellationRequested();

        int ocId;
        if (!map.TryGetValue(ev.Outcome, out ocId)) {
            ocId = outcomeCount++;
            map[ev.Outcome] = ocId;
        }

        // keep only the predicates that are part of the index
        foreach (var pred in ev.Context) {
            int predId;
            if (predicateIndex.TryGetValue(pred, out predId))
                indexedContext.Add(predId);
        }

        // drop events with no active features
        if (indexedContext.Count > 0) {
            eventsToCompare.Add(new ComparableEvent(ocId, indexedContext.ToArray()));
        } else {
            Monitor?.OnWarning($"Dropped event {ev.Outcome}:{ev.Context.ToDisplay()}");
        }

        indexedContext.Clear();
    }

    outcomeLabels = ToIndexedStringArray(map);
    predLabels = ToIndexedStringArray(predicateIndex);

    return eventsToCompare;
}
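// Illustrative sketch, not part of the library: the Index method above maps
// outcome strings to integer ids, keeps only predicates present in the
// predicate index, and drops events with no active features. A standalone
// version of that logic; ToyEvent and BuildIndex are hypothetical names.
using System;
using System.Collections.Generic;

public static class PredicateIndexingSketch {
    public sealed class ToyEvent {
        public string Outcome;
        public string[] Context;
    }

    // returns (outcomeId, active predicate ids) per surviving event
    public static List<Tuple<int, int[]>> BuildIndex(
        IEnumerable<ToyEvent> events, Dictionary<string, int> predicateIndex) {

        var outcomes = new Dictionary<string, int>();
        var result = new List<Tuple<int, int[]>>();

        foreach (var ev in events) {
            int ocId;
            if (!outcomes.TryGetValue(ev.Outcome, out ocId)) {
                ocId = outcomes.Count;
                outcomes[ev.Outcome] = ocId;
            }

            var ids = new List<int>();
            foreach (var pred in ev.Context) {
                int predId;
                if (predicateIndex.TryGetValue(pred, out predId))
                    ids.Add(predId);
            }

            if (ids.Count > 0) // drop events with no active features
                result.Add(Tuple.Create(ocId, ids.ToArray()));
        }
        return result;
    }
}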
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public ChunkSample Read() {
    AdSentence paragraph;
    while ((paragraph = adSentenceStream.Read()) != null) {
        if (End > -1 && Index >= End) // past the configured end of the range
            return null;

        if (Start > -1 && Index < Start) {
            Index++;
        } else {
            var root = paragraph.Root;
            var sentence = new List<string>();
            var tags = new List<string>();
            var target = new List<string>();

            ProcessRoot(root, sentence, tags, target);

            if (sentence.Count <= 0)
                continue;

            Index++;

            return new ChunkSample(sentence.ToArray(), tags.ToArray(), target.ToArray());
        }
    }
    return null;
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public Event Read() {
    var eventString = objectStream.Read();
    return eventString != null ? CreateEvent(eventString, Monitor) : null;
}
public static void PopulatePOSDictionary(IObjectStream<POSSample> samples,
    IMutableTagDictionary dictionary, bool caseSensitive, int cutoff) {

    var newEntries = new Dictionary<string, Dictionary<string, int>>();

    POSSample sample;
    while ((sample = samples.Read()) != null) {
        for (int i = 0; i < sample.Sentence.Length; i++) {
            // words containing digits are not added to the dictionary
            if (StringPattern.Recognize(sample.Sentence[i]).ContainsDigit)
                continue;

            var word = caseSensitive
                ? sample.Sentence[i]
                : sample.Sentence[i].ToLowerInvariant();

            if (!newEntries.ContainsKey(word))
                newEntries.Add(word, new Dictionary<string, int>());

            // seed tags already known to the dictionary at the cutoff count,
            // so they always survive the filtering below
            var dicTags = dictionary.GetTags(word);
            if (dicTags != null) {
                foreach (var tag in dicTags) {
                    if (!newEntries[word].ContainsKey(tag))
                        newEntries[word].Add(tag, cutoff);
                }
            }

            if (!newEntries[word].ContainsKey(sample.Tags[i])) {
                newEntries[word].Add(sample.Tags[i], 1);
            } else {
                newEntries[word][sample.Tags[i]]++;
            }
        }
    }

    foreach (var wordEntry in newEntries) {
        var tagsForWord = new List<string>();
        foreach (var entry in wordEntry.Value) {
            if (entry.Value >= cutoff)
                tagsForWord.Add(entry.Key);
        }
        if (tagsForWord.Count > 0)
            dictionary.Put(wordEntry.Key, tagsForWord.ToArray());
    }
}
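// Illustrative sketch, not part of the library: PopulatePOSDictionary counts,
// per word, how often each tag is seen and then keeps only tags whose count
// reaches the cutoff. A compact, standalone version of the count-then-filter
// step; all names below are hypothetical.
using System;
using System.Collections.Generic;
using System.Linq;

public static class TagCutoffSketch {
    public static Dictionary<string, string[]> Filter(
        IEnumerable<Tuple<string, string>> taggedWords, // (word, tag) pairs
        int cutoff) {

        var counts = new Dictionary<string, Dictionary<string, int>>();
        foreach (var pair in taggedWords) {
            Dictionary<string, int> tagCounts;
            if (!counts.TryGetValue(pair.Item1, out tagCounts))
                counts[pair.Item1] = tagCounts = new Dictionary<string, int>();

            int n;
            tagCounts.TryGetValue(pair.Item2, out n);
            tagCounts[pair.Item2] = n + 1; // count the (word, tag) occurrence
        }

        // keep only tags seen at least 'cutoff' times for a given word
        return counts.ToDictionary(
            w => w.Key,
            w => w.Value.Where(t => t.Value >= cutoff).Select(t => t.Key).ToArray());
    }
}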
public T Read() {
    if (isPoisoned)
        throw new InvalidOperationException();

    // skip training samples
    while (index % numberOfPartitions != testIndex) {
        sampleStream.Read();
        index++;
    }

    index++;

    return sampleStream.Read();
}
/// <summary>
/// Reads all sample objects from the stream and evaluates each sample object
/// with the <see cref="EvaluateSample"/> method.
/// </summary>
/// <param name="samples">The samples.</param>
public void Evaluate(IObjectStream<T> samples) {
    T sample;
    while ((sample = samples.Read()) != null) {
        EvaluateSample(sample);
    }
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public T Read() {
    if (testSampleStream != null || isPoisoned)
        throw new InvalidOperationException();

    // If the test element is reached, skip over it so it is not included
    // in the training data
    if (index % numberOfPartitions == testIndex) {
        sampleStream.Read();
        index++;
    }

    index++;

    return sampleStream.Read();
}
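// Illustrative sketch, not part of the library: the two cross-validation
// readers above split one stream into folds by index modulo the number of
// partitions. With numberOfPartitions = 3 and testIndex = 1, samples
// 1, 4, 7, ... form the test fold and every other sample the training fold:
using System;

public static class PartitionSketch {
    public static void Main() {
        const int numberOfPartitions = 3;
        const int testIndex = 1;

        for (var index = 0; index < 9; index++) {
            var fold = index % numberOfPartitions == testIndex ? "test" : "train";
            Console.WriteLine($"sample {index}: {fold}");
        }
    }
}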
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public virtual T Read() {
    var value = stream.Read();
    if (value == null)
        IsFinished = true;

    return value;
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public TokenSample Read() {
    var inputString = input.Read();
    if (inputString != null) {
        var tokens = tokenizer.TokenizePos(inputString);
        return new TokenSample(inputString, tokens);
    }
    return null;
}
/// <summary>
/// Returns the next <see cref="T:SentenceSample"/>. Calling this method repeatedly
/// until it returns null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public SentenceSample Read() {
    if (sentence == null) {
        sentence = adSentenceStream.Read();
        UpdateMeta();
        if (sentence == null)
            return null;
    }

    var document = new StringBuilder();
    var sentences = new List<Span>();
    do {
        do {
            if (!isTitle || isIncludeTitles) {
                if (HasPunctuation(sentence.Text)) {
                    var start = document.Length;
                    document.Append(sentence.Text);
                    sentences.Add(new Span(start, document.Length));
                    document.Append(' ');
                }
            }

            sentence = adSentenceStream.Read();
            UpdateMeta();
        } while (isSamePara);
    } while (isSameText);

    // trim the trailing space appended after the last sentence
    return new SentenceSample(
        document.Length > 0
            ? document.ToString(0, document.Length - 1)
            : document.ToString(),
        sentences.ToArray());
}
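// Illustrative sketch, not part of the library: the Read method above builds
// a document string by appending sentences separated by single spaces and
// recording each sentence's character range as a span. The assembly step in
// isolation; all names below are hypothetical.
using System;
using System.Collections.Generic;
using System.Text;

public static class DocumentAssemblySketch {
    public static string Assemble(IEnumerable<string> sentences, List<Tuple<int, int>> spans) {
        var document = new StringBuilder();
        foreach (var text in sentences) {
            var start = document.Length;
            document.Append(text);
            spans.Add(Tuple.Create(start, document.Length)); // [start, end) in chars
            document.Append(' ');
        }
        // trim the trailing separator appended after the last sentence
        return document.Length > 0 ? document.ToString(0, document.Length - 1) : string.Empty;
    }
}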
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public POSSample Read() {
    callsCount++;

    AdSentence paragraph;
    while ((paragraph = adSentenceStream.Read()) != null) {
        var root = paragraph.Root;
        var sentence = new List<string>();
        var tags = new List<string>();
        var contractions = new List<string>();
        var prop = new List<string>();

        Process(root, sentence, tags, contractions, prop);

        if (sentence.Count != contractions.Count || sentence.Count != prop.Count)
            throw new InvalidOperationException("The processed information must have the same length.");

        if (additionalContext) {
            var ac = new string[2][];

            ac[0] = new string[sentence.Count]; // line 0: contractions
            ac[1] = new string[sentence.Count]; // line 1: props

            for (var i = 0; i < sentence.Count; i++) {
                if (contractions[i] != null)
                    ac[0][i] = contractions[i];

                if (prop[i] != null)
                    ac[1][i] = prop[i];
            }

            return new POSSample(sentence.ToArray(), tags.ToArray(), ac);
        }

        return new POSSample(sentence.ToArray(), tags.ToArray());
    }
    return null;
}
/// <summary>
/// Builds the NGram dictionary with the given samples.
/// </summary>
/// <param name="samples">The samples.</param>
/// <param name="cutoff">The cutoff.</param>
/// <returns>The NGram dictionary.</returns>
public static Dic BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
    var model = new NGramModel();

    POSSample sample;
    while ((sample = samples.Read()) != null) {
        if (sample.Sentence.Length > 0)
            model.Add(new StringList(sample.Sentence), 1, 1);
    }

    model.CutOff(cutoff, int.MaxValue);

    return model.ToDictionary();
}
/// <summary>
/// Returns the next <see cref="Sequence"/>. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next <see cref="Sequence"/> or null to signal that the stream is exhausted.
/// </returns>
public Sequence Read() {
    var sample = objectStream.Read();
    if (sample != null) {
        var events = new Event[sample.Sentence.Length];
        for (var i = 0; i < events.Length; i++) {
            events[i] = new Event(
                sample.Tags[i],
                contextGenerator.GetContext(i, sample.Sentence, sample.Tags, null));
        }
        return new Sequence(events, sample);
    }
    return null;
}
/// <summary>
/// Returns the next <see cref="Sequence"/>. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public Sequence Read() {
    var sample = samples.Read();
    if (sample != null) {
        var events = new Event[sample.Sentence.Count];
        for (int i = 0, count = sample.Sentence.Count; i < count; i++) {
            // it is safe to pass the tags as previous tags because
            // the context generator does not look for non predicted tags
            events[i] = new Event(
                sample.Tags[i],
                contextGenerator.GetContext(i, sample.Sentence.ToArray(), sample.Tags.ToArray(), null));
        }
        return new Sequence(events, sample);
    }
    return null;
}
/// <summary>
/// Returns the next <see cref="T:Event"/>. Calling this method repeatedly until it returns
/// null will return each event from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public Event Read() {
    // serve the remaining events of the current sample first
    if (events.MoveNext())
        return events.Current;

    // otherwise advance to the next sample that produces at least one event
    T sample;
    while ((sample = samples.Read()) != null) {
        events = CreateEvents(sample);
        if (events != null && events.MoveNext())
            return events.Current;
    }
    return null;
}
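// Illustrative sketch, not part of the library: the Read method above
// flattens a stream of samples into a stream of events by holding an
// enumerator over the current sample's events and advancing to the next
// sample once it is exhausted. A self-contained version of the pattern;
// FlatteningReader and its members are hypothetical names.
using System.Collections.Generic;
using System.Linq;

public abstract class FlatteningReader<TSample, TEvent>
    where TSample : class
    where TEvent : class {

    private IEnumerator<TEvent> events = Enumerable.Empty<TEvent>().GetEnumerator();

    protected abstract TSample ReadSample();                        // null = exhausted
    protected abstract IEnumerator<TEvent> CreateEvents(TSample sample);

    public TEvent Read() {
        // serve the remaining events of the current sample first
        if (events.MoveNext())
            return events.Current;

        // otherwise advance to the next sample that yields at least one event
        TSample sample;
        while ((sample = ReadSample()) != null) {
            events = CreateEvents(sample);
            if (events != null && events.MoveNext())
                return events.Current;
        }
        return null; // both the sample stream and the event buffer are exhausted
    }
}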
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public NameSample Read() {
    AdSentence paragraph;
    while ((paragraph = adSentenceStream.Read()) != null) {
        if (monitor != null)
            monitor.Token.ThrowIfCancellationRequested();

        var root = paragraph.Root;
        var sentence = new List<string>();
        var names = new List<Span>();

        Process(root, sentence, names);

        return new NameSample(sentence.ToArray(), names.ToArray(), true);
    }
    return null;
}
public Sequence Read() {
    var sample = samples.Read();
    if (sample == null)
        return null;

    var events = new Event[sample.Length];
    for (var i = 0; i < sample.Length; i++) {
        // it is safe to pass the tags as previous tags because
        // the context generator does not look for non predicted tags
        var context = contextGenerator.GetContext(i, sample.Tokens, sample.Tags, sample.Lemmas);

        events[i] = new Event(sample.Tags[i], context);
    }
    return new Sequence(events, sample);
}
/// <summary>
/// Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
/// </summary>
/// <param name="data">The data stream of parses.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();

    Parse p;
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];

        // add all uni-grams
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);

        // add tri-grams and bi-grams for the initial sequence
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);

        // emulate reductions to produce additional n-grams
        var ci = 0;
        while (ci < chunks.Length) {
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                // perform the reduce
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;

                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;
                if (chunks.Length != 0) {
                    // collect a window of up to two chunk heads on each side of ci
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0)
                        window[wi++] = chunks[ci - 2].Head.CoveredText;

                    if (ci - 1 >= 0)
                        window[wi++] = chunks[ci - 1].Head.CoveredText;

                    window[wi++] = chunks[ci].Head.CoveredText;

                    if (ci + 1 < chunks.Length)
                        window[wi++] = chunks[ci + 1].Head.CoveredText;

                    if (ci + 2 < chunks.Length)
                        window[wi++] = chunks[ci + 2].Head.CoveredText;

                    if (wi < 5)
                        Array.Resize(ref window, wi); // shrink to the words actually collected

                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }
                ci = reduceStart - 1; // ci will be incremented at the end of the loop
            }
            ci++;
        }
    }

    dict.CutOff(cutoff, int.MaxValue);

    return dict.ToDictionary(true);
}
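// Illustrative sketch, not part of the library: the reduction loop above
// collects up to two chunk heads on each side of position ci into a window of
// at most five words. The windowing step in isolation; names are hypothetical.
using System;
using System.Collections.Generic;

public static class WindowSketch {
    public static string[] Window(IReadOnlyList<string> heads, int ci) {
        var window = new List<string>(5);
        // take up to two heads before ci, the head at ci, and up to two after
        for (var i = Math.Max(0, ci - 2); i <= Math.Min(heads.Count - 1, ci + 2); i++)
            window.Add(heads[i]);
        return window.ToArray();
    }
}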
private List<ComparableEvent> Index(
    IObjectStream<Event> indexEventStream,
    Dictionary<string, int> predicateIndex) {

    var map = new Dictionary<string, int>();
    var indexedContext = new List<int>();
    var eventsToCompare = new List<ComparableEvent>();
    int outcomeCount = 0;

    Event ev;
    while ((ev = indexEventStream.Read()) != null) {
        int ocID;

        if (Monitor != null && Monitor.Token.CanBeCanceled)
            Monitor.Token.ThrowIfCancellationRequested();

        if (map.ContainsKey(ev.Outcome)) {
            ocID = map[ev.Outcome];
        } else {
            ocID = outcomeCount++;
            map[ev.Outcome] = ocID;
        }

        // ReSharper disable once LoopCanBeConvertedToQuery
        foreach (var pred in ev.Context) {
            if (predicateIndex.ContainsKey(pred)) {
                indexedContext.Add(predicateIndex[pred]);
            }
        }

        // drop events with no active features
        if (indexedContext.Count > 0) {
            var cons = new int[indexedContext.Count];
            for (int ci = 0; ci < cons.Length; ci++) {
                cons[ci] = indexedContext[ci];
            }
            eventsToCompare.Add(new ComparableEvent(ocID, cons));
        } else {
            if (Monitor != null)
                Monitor.OnWarning(string.Format("Dropped event {0}:{1}", ev.Outcome, ev.Context.ToDisplay()));
        }

        indexedContext.Clear();
    }

    outcomeLabels = ToIndexedStringArray(map);
    predLabels = ToIndexedStringArray(predicateIndex);

    return eventsToCompare;
}
public static void PopulatePOSDictionary(IObjectStream<POSSample> samples,
    IMutableTagDictionary dictionary, bool caseSensitive, int cutoff) {

    var newEntries = new Dictionary<string, Dictionary<string, int>>();

    POSSample sample;
    while ((sample = samples.Read()) != null) {
        for (int i = 0; i < sample.Sentence.Length; i++) {
            if (!StringPattern.Recognize(sample.Sentence[i]).ContainsDigit) {
                string word = caseSensitive
                    ? sample.Sentence[i]
                    : sample.Sentence[i].ToLowerInvariant();

                if (!newEntries.ContainsKey(word)) {
                    newEntries.Add(word, new Dictionary<string, int>());
                }

                var dicTags = dictionary.GetTags(word);
                if (dicTags != null) {
                    foreach (var tag in dicTags) {
                        if (!newEntries[word].ContainsKey(tag)) {
                            newEntries[word].Add(tag, cutoff);
                        }
                    }
                }

                if (!newEntries[word].ContainsKey(sample.Tags[i])) {
                    newEntries[word].Add(sample.Tags[i], 1);
                } else {
                    newEntries[word][sample.Tags[i]]++;
                }
            }
        }
    }

    foreach (var wordEntry in newEntries) {
        var tagsForWord = (from entry in wordEntry.Value
                           where entry.Value >= cutoff
                           select entry.Key).ToList();

        if (tagsForWord.Count > 0)
            dictionary.Put(wordEntry.Key, tagsForWord.ToArray());
    }
}
/// <summary>
/// Builds the NGram dictionary with the given samples.
/// </summary>
/// <param name="samples">The samples.</param>
/// <param name="cutoff">The cutoff.</param>
/// <returns>The NGram dictionary.</returns>
public static Dict BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
    var model = new NGramModel();

    POSSample sample;
    while ((sample = samples.Read()) != null) {
        if (sample.Sentence.Length > 0) {
            model.Add(new StringList(sample.Sentence), 1, 1);
        }
    }

    model.CutOff(cutoff, int.MaxValue);

    return model.ToDictionary();
}
/// <summary>
/// Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
/// </summary>
/// <param name="data">The data stream of parses.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();

    Parse p;
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];

        // add all uni-grams
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);

        // add tri-grams and bi-grams for the initial sequence
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);

        // emulate reductions to produce additional n-grams
        var ci = 0;
        while (ci < chunks.Length) {
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                // perform the reduce
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;

                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;
                if (chunks.Length != 0) {
                    // collect a window of up to two chunk heads on each side of ci
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0) {
                        window[wi++] = chunks[ci - 2].Head.CoveredText;
                    }
                    if (ci - 1 >= 0) {
                        window[wi++] = chunks[ci - 1].Head.CoveredText;
                    }
                    window[wi++] = chunks[ci].Head.CoveredText;
                    if (ci + 1 < chunks.Length) {
                        window[wi++] = chunks[ci + 1].Head.CoveredText;
                    }
                    if (ci + 2 < chunks.Length) {
                        window[wi++] = chunks[ci + 2].Head.CoveredText;
                    }
                    if (wi < 5) {
                        // shrink the window to the words actually collected
                        var subWindow = new string[wi];
                        for (var swi = 0; swi < wi; swi++) {
                            subWindow[swi] = window[swi];
                        }
                        window = subWindow;
                    }
                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }
                ci = reduceStart - 1; // ci will be incremented at the end of the loop
            }
            ci++;
        }
    }

    dict.CutOff(cutoff, int.MaxValue);

    return dict.ToDictionary(true);
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public NameSample Read() {
    var sentence = new List<string>();
    var tags = new List<string>();

    var isClearAdaptiveData = false;

    // Empty line indicates end of sentence
    string line;
    while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line)) {
        if (line.StartsWith("###MEDLINE:")) {
            isClearAdaptiveData = true;
            lineStream.Read(); // consume the line following the MEDLINE marker
            continue;
        }

        if (line.Contains("ABSTRACT TRUNCATED"))
            continue;

        var fields = line.Split('\t');

        if (fields.Length == 2) {
            sentence.Add(fields[0]);
            tags.Add(fields[1]);
        } else {
            throw new InvalidFormatException("Expected two fields per line in training data, got " +
                                             fields.Length + " for line '" + line + "'!");
        }
    }

    if (sentence.Count > 0) {
        // convert name tags into spans
        var names = new List<Span>();

        var beginIndex = -1;
        var endIndex = -1;
        for (var i = 0; i < tags.Count; i++) {
            var tag = tags[i];

            // disable the entity types that were not requested
            if (tag.EndsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
                tag = "O";

            if (tag.EndsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
                tag = "O";

            if (tag.EndsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
                tag = "O";

            // cell_line entities are controlled by their own flag
            if (tag.EndsWith("cell_line") && (types & GENERATE_CELLLINE_ENTITIES) == 0)
                tag = "O";

            if (tag.EndsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0)
                tag = "O";

            if (tag.StartsWith("B-")) {
                if (beginIndex != -1) {
                    names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                }
                beginIndex = i;
                endIndex = i + 1;
            } else if (tag.StartsWith("I-")) {
                endIndex++;
            } else if (tag.Equals("O")) {
                if (beginIndex != -1) {
                    names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                    beginIndex = -1;
                    endIndex = -1;
                }
            } else {
                throw new IOException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
            names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));

        return new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData);
    }

    if (line != null) {
        // Just filter out empty events, if two lines in a row are empty
        return Read();
    }

    // source stream is not returning any more lines
    return null;
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public NameSample Read() {
    var sentence = new List<string>();
    var tags = new List<string>();

    var clearAdaptiveData = false;

    // Empty line indicates end of sentence
    string line;
    while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line)) {
        if (language == Language.Nl && line.StartsWith(DocStart)) {
            clearAdaptiveData = true;
            continue;
        }

        var fields = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        if (fields.Length == 3) {
            sentence.Add(fields[0]);
            tags.Add(fields[2]);
        } else {
            throw new InvalidFormatException(
                string.Format("Expected three fields per line in training data, got {0} for line '{1}'!",
                    fields.Length, line));
        }
    }

    // Always clear adaptive data for Spanish
    if (language == Language.Es)
        clearAdaptiveData = true;

    if (sentence.Count > 0) {
        // convert name tags into spans
        var names = new List<Span>();

        var beginIndex = -1;
        var endIndex = -1;
        for (var i = 0; i < tags.Count; i++) {
            var tag = tags[i];

            // disable the entity types that were not requested
            if (tag.EndsWith("PER") && (types & Types.PersonEntities) == 0)
                tag = "O";

            if (tag.EndsWith("ORG") && (types & Types.OrganizationEntities) == 0)
                tag = "O";

            if (tag.EndsWith("LOC") && (types & Types.LocationEntities) == 0)
                tag = "O";

            if (tag.EndsWith("MISC") && (types & Types.MiscEntities) == 0)
                tag = "O";

            if (tag.StartsWith("B-")) {
                if (beginIndex != -1) {
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                }
                beginIndex = i;
                endIndex = i + 1;
            } else if (tag.StartsWith("I-")) {
                endIndex++;
            } else if (tag.Equals("O")) {
                if (beginIndex != -1) {
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                    beginIndex = -1;
                    endIndex = -1;
                }
            } else {
                throw new InvalidFormatException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));

        return new NameSample(sentence.ToArray(), names.ToArray(), clearAdaptiveData);
    }

    return line != null ? Read() : null;
}
/// <summary>
/// Returns the next <see cref="PtbNode"/> object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next <see cref="PtbNode"/> object or <c>null</c> to signal that the stream is exhausted.
/// </returns>
public PtbNode Read() {
    retry:

    if (monitor != null && monitor.Token.CanBeCanceled)
        monitor.Token.ThrowIfCancellationRequested();

    PtbNode root = null;
    var pos = 0; // text position
    var stack = new Stack<PtbNode>();
    var invalid = false;

    string line;
    while ((line = lineStream.Read()) != null) {
        line = line.TrimStart(' ', '\t', '\u00A0'); // \u00A0 = NO-BREAK SPACE

        if (invalid) {
            // consume the rest of the invalid sentence, only tracking brackets
            if (line.Trim().Length == 0) // end of sentence
                goto retry;

            for (var i = 0; i < line.Length; i++) {
                switch (line[i]) {
                    case '#':
                        if (i == 0)
                            goto next; // ignore comment
                        break;
                    case '(':
                        stack.Push(null);
                        continue;
                    case ')':
                        stack.Pop();
                        if (stack.Count == 0)
                            goto done;
                        continue;
                    default:
                        continue;
                }
            }
            continue; // ignore invalid data
        }

        if (line.Length == 0) {
            if (root != null)
                goto done;
            continue;
        }

        for (var i = 0; i < line.Length; i++) {
            switch (line[i]) {
                case '#':
                    if (i == 0)
                        goto next; // ignore comment, if the line starts with '#'
                    continue;
                case '(':
                    var rest = line.Substring(i + 1);
                    var type = resolver.GetType(rest, useFunctionTags);
                    var token = resolver.GetToken(rest);
                    if (type == null) {
                        if (monitor != null)
                            monitor.OnWarning("Penn Treebank node without type: " + line);

                        stack.Push(null);
                        invalid = true;
                        goto next;
                    }

                    // skip a few chars to improve performance (if possible)
                    int skip;
                    if (token != null && (skip = rest.IndexOf(')')) != -1)
                        i += skip;

                    var child = token != null
                        ? new PtbNode { Type = type, Token = token, Span = new Span(pos, pos + token.Length) }
                        : new PtbNode { Type = type };

                    if (token != null)
                        pos += token.Length + 1;

                    if (root == null)
                        root = child;

                    if (stack.Count > 0) {
                        var parent = stack.Peek();
                        // check if the parent node is a gap
                        if (parent != null) {
                            parent.Children.Add(child);
                        } else {
                            // search for the first parent node that is not a gap
                            var array = stack.ToArray();
                            foreach (var p in array) {
                                if (p == null)
                                    continue;

                                p.Children.Add(child);
                                break;
                            }
                        }
                    }

                    stack.Push(child);
                    continue;
                case ')':
                    var pop = stack.Pop();

                    // adjust the span to cover all the children
                    if (pop != null && pop.HasChildren) {
                        var s = GetStartPos(pop);
                        var e = GetEndPos(pop);
                        if (s.HasValue && e.HasValue)
                            pop.Span = new Span(s.Value, e.Value);
                    }

                    if (stack.Count == 0)
                        goto done;

                    continue;
            }
        }

        next:
        ;
    }

    done:

    // check if the sentence is invalid
    if (invalid || stack.Count != 0) {
        if (monitor != null)
            monitor.OnWarning("An invalid Penn Treebank sentence was skipped.");

        goto retry;
    }

    // end of stream
    if (root == null)
        return null;

    var rs = GetStartPos(root);
    var re = GetEndPos(root);

    root.Span = new Span(rs.Value, re.Value);

    // if the stack is not empty, the sentence is incomplete/invalid
    return stack.Count == 0 ? root : null;
}
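// Illustrative sketch, not part of the library: at its core, the Penn
// Treebank reader above is a stack machine over '(' and ')' that finishes a
// tree when the nesting returns to zero. A minimal checker for that bracket
// discipline; BracketSketch and IsCompleteTree are hypothetical names.
public static class BracketSketch {
    // true when the text opens at least one bracket and closes every one,
    // mirroring the "stack.Count == 0 -> done" checks in the reader above
    public static bool IsCompleteTree(string text) {
        var depth = 0;
        var opened = false;
        foreach (var c in text) {
            if (c == '(') {
                depth++;
                opened = true;
            } else if (c == ')') {
                if (depth == 0)
                    return false; // unbalanced: a ')' without a matching '('
                depth--;
            }
        }
        return opened && depth == 0;
    }
}
// e.g. IsCompleteTree("(S (NP (DT the) (NN dog)) (VP (VBZ barks)))") == true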
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public NameSample Read() {
    var sentence = new List<string>();
    var tags = new List<string>();

    var isClearAdaptiveData = false;

    // Empty line indicates end of sentence
    string line;
    while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line)) {
        if (line.StartsWith(DocStart)) {
            isClearAdaptiveData = true;
            line = lineStream.Read();
            if (!string.IsNullOrEmpty(line))
                throw new InvalidFormatException("Expected an empty line after -DOCSTART-, got: '" + line + "'!");

            continue;
        }

        var fields = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        if (language == Language.En && fields.Length == 4) {
            // For English: WORD POS-TAG SC-TAG NE-TAG
            sentence.Add(fields[0]);
            tags.Add(fields[3]);
        } else if (language == Language.De && fields.Length == 5) {
            // For German: WORD LEMMA-TAG POS-TAG SC-TAG NE-TAG
            sentence.Add(fields[0]);
            tags.Add(fields[4]);
        } else {
            throw new InvalidFormatException(
                $"Incorrect number of fields per line for language: '{line}'!");
        }
    }

    if (sentence.Count > 0) {
        // convert name tags into spans
        var names = new List<Span>();

        var beginIndex = -1;
        var endIndex = -1;
        for (var i = 0; i < tags.Count; i++) {
            var tag = tags[i];

            // disable the entity types that were not requested
            if (tag.EndsWith("PER") && (types & Types.PersonEntities) == 0)
                tag = "O";

            if (tag.EndsWith("ORG") && (types & Types.OrganizationEntities) == 0)
                tag = "O";

            if (tag.EndsWith("LOC") && (types & Types.LocationEntities) == 0)
                tag = "O";

            if (tag.EndsWith("MISC") && (types & Types.MiscEntities) == 0)
                tag = "O";

            if (tag == "O") {
                if (beginIndex == -1)
                    continue;

                names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                beginIndex = -1;
                endIndex = -1;
            } else if (tag.StartsWith("B-")) {
                // a B- prefix means two entities of the same type are adjacent
                if (beginIndex != -1)
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));

                beginIndex = i;
                endIndex = i + 1;
            } else if (tag.StartsWith("I-")) {
                // an I- tag starts or continues the current name entity
                if (beginIndex == -1) {
                    beginIndex = i;
                    endIndex = i + 1;
                } else if (!tag.EndsWith(tags[beginIndex].Substring(1))) {
                    // a different entity type follows the current series,
                    // so close the previous span and open a new one
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                    beginIndex = i;
                    endIndex = i + 1;
                } else {
                    endIndex++;
                }
            } else {
                throw new InvalidFormatException("Invalid tag: " + tag);
            }
        }

        // if one span remains, create it here
        if (beginIndex != -1)
            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));

        return new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData);
    }

    if (line != null) {
        // Just filter out empty events, if two lines in a row are empty
        return Read();
    }

    // source stream is not returning any more lines
    return null;
}
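// Illustrative sketch, not part of the library: the CoNLL-style readers above
// share one decoding step, turning per-token B-/I-/O tags back into typed
// spans. A standalone version of that conversion; the NameSpan type and all
// names below are hypothetical.
using System.Collections.Generic;

public static class BioDecodeSketch {
    public struct NameSpan {
        public int Start;     // first token of the name
        public int End;       // one past the last token
        public string Type;   // e.g. "PER"
    }

    public static List<NameSpan> Decode(string[] tags) {
        var names = new List<NameSpan>();
        var begin = -1;

        for (var i = 0; i < tags.Length; i++) {
            if (tags[i].StartsWith("B-")) {
                // a new name begins; close the previous one, if any
                if (begin != -1)
                    names.Add(new NameSpan { Start = begin, End = i, Type = tags[begin].Substring(2) });
                begin = i;
            } else if (tags[i].StartsWith("I-")) {
                if (begin == -1)
                    begin = i; // tolerate a name that starts with an I- tag
                // otherwise the I- tag extends the current name
            } else { // "O": outside any name closes the current one, if any
                if (begin != -1) {
                    names.Add(new NameSpan { Start = begin, End = i, Type = tags[begin].Substring(2) });
                    begin = -1;
                }
            }
        }

        if (begin != -1) // flush a name that runs to the end of the sentence
            names.Add(new NameSpan { Start = begin, End = tags.Length, Type = tags[begin].Substring(2) });

        return names;
    }
}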