Example #1
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public NameSample Read()
        {
            AdSentence paragraph;

            if (monitor != null)
            {
                monitor.Token.ThrowIfCancellationRequested();
            }

            while ((paragraph = adSentenceStream.Read()) != null)
            {
                var clearData = false;

                var currentTextID = GetTextId(paragraph);
                if (currentTextID != textId)
                {
                    clearData = true;
                    textId    = currentTextID;
                }

                var root     = paragraph.Root;
                var sentence = new List <string>();
                var names    = new List <Span>();

                Process(root, sentence, names);

                return(new NameSample(sentence.ToArray(), names.ToArray(), clearData));
            }

            return(null);
        }
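Every Read() shown on this page implements the same pull-style contract: calling it repeatedly hands back each object from the underlying source exactly once, and null signals that the stream is exhausted. A minimal sketch of that contract and of the usual drain loop, assuming a SharpNL-style IObjectStream<T> reduced to the single member these snippets rely on (the real interface also exposes members such as Reset and Dispose), could look like this:

    using System.Collections.Generic;

    // Assumed shape only: the single member the examples on this page rely on.
    public interface IObjectStream<T> where T : class
    {
        T Read(); // null signals that the stream is exhausted
    }

    public static class ObjectStreamExtensions
    {
        // Drains a stream: each object from the underlying source is yielded exactly once.
        public static IEnumerable<T> ReadAll<T>(this IObjectStream<T> stream) where T : class
        {
            T item;

            while ((item = stream.Read()) != null)
            {
                yield return item;
            }
        }
    }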
Example #2
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public Sequence Read()
        {
            var sample = psi.Read();

            if (sample != null)
            {
                var events = new Event[sample.Sentence.Length];

                // it is safe to pass the tags as previous tags because
                // the context generator does not look for non predicted tags
                var tags = seqCodec.Encode(sample.Names, sample.Sentence.Length);

                for (int i = 0; i < sample.Sentence.Length; i++)
                {
                    var context = pcg.GetContext(
                        i,
                        sample.Sentence,
                        useOutcomes ? tags : null,
                        null);

                    events[i] = new Event(tags[i], context);
                }

                return(new Sequence(events, sample));
            }
            return(null);
        }
        private List <ComparableEvent> Index(
            IObjectStream <Event> indexEventStream,
            Dictionary <string, int> predicateIndex)
        {
            var map             = new Dictionary <string, int>();
            var indexedContext  = new List <int>();
            var eventsToCompare = new List <ComparableEvent>();
            int outcomeCount    = 0;

            Event ev;

            while ((ev = indexEventStream.Read()) != null)
            {
                int ocId;

                if (Monitor != null && Monitor.Token.CanBeCanceled)
                {
                    Monitor.Token.ThrowIfCancellationRequested();
                }

                if (map.ContainsKey(ev.Outcome))
                {
                    ocId = map[ev.Outcome];
                }
                else
                {
                    ocId            = outcomeCount++;
                    map[ev.Outcome] = ocId;
                }

                // ReSharper disable once LoopCanBeConvertedToQuery
                foreach (var pred in ev.Context)
                {
                    if (predicateIndex.ContainsKey(pred))
                    {
                        indexedContext.Add(predicateIndex[pred]);
                    }
                }

                // drop events with no active features
                if (indexedContext.Count > 0)
                {
                    var cons = new int[indexedContext.Count];
                    for (int ci = 0; ci < cons.Length; ci++)
                    {
                        cons[ci] = indexedContext[ci];
                    }
                    eventsToCompare.Add(new ComparableEvent(ocId, cons));
                }
                else
                {
                    Monitor?.OnWarning($"Dropped event {ev.Outcome}:{ev.Context.ToDisplay()}");
                }
                indexedContext.Clear();
            }

            outcomeLabels = ToIndexedStringArray(map);
            predLabels    = ToIndexedStringArray(predicateIndex);
            return(eventsToCompare);
        }
Example #4
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns,
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public ChunkSample Read()
        {
            AdSentence paragraph;

            while ((paragraph = adSentenceStream.Read()) != null)
            {
                if (End > -1 && Index >= End)   // leave
                {
                    return(null);
                }

                if (Start > -1 && Index < Start)
                {
                    Index++;
                }
                else
                {
                    var root     = paragraph.Root;
                    var sentence = new List <string>();
                    var tags     = new List <string>();
                    var target   = new List <string>();

                    ProcessRoot(root, sentence, tags, target);

                    if (sentence.Count <= 0)
                    {
                        continue;
                    }

                    Index++;
                    return(new ChunkSample(sentence.ToArray(), tags.ToArray(), target.ToArray()));
                }
            }
            return(null);
        }
Example #5
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns,
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public Event Read()
        {
            var eventString = objectStream.Read();

            return(eventString != null
                ? CreateEvent(eventString, Monitor)
                : null);
        }
Example #6
        public static void PopulatePOSDictionary(IObjectStream <POSSample> samples, IMutableTagDictionary dictionary, bool caseSensitive, int cutoff)
        {
            var       newEntries = new Dictionary <string, Dictionary <string, int> >();
            POSSample sample;

            while ((sample = samples.Read()) != null)
            {
                for (int i = 0; i < sample.Sentence.Length; i++)
                {
                    if (!StringPattern.Recognize(sample.Sentence[i]).ContainsDigit)
                    {
                        string word = caseSensitive ? sample.Sentence[i] : sample.Sentence[i].ToLowerInvariant();

                        if (!newEntries.ContainsKey(word))
                        {
                            newEntries.Add(word, new Dictionary <string, int>());
                        }

                        var dicTags = dictionary.GetTags(word);
                        if (dicTags != null)
                        {
                            foreach (var tag in dicTags)
                            {
                                if (!newEntries[word].ContainsKey(tag))
                                {
                                    newEntries[word].Add(tag, cutoff);
                                }
                            }
                        }

                        if (!newEntries[word].ContainsKey(sample.Tags[i]))
                        {
                            newEntries[word].Add(sample.Tags[i], 1);
                        }
                        else
                        {
                            newEntries[word][sample.Tags[i]]++;
                        }
                    }
                }
            }

            foreach (var wordEntry in newEntries)
            {
                var tagsForWord = new List <string>();
                foreach (var entry in wordEntry.Value)
                {
                    if (entry.Value >= cutoff)
                    {
                        tagsForWord.Add(entry.Key);
                    }
                }
                if (tagsForWord.Count > 0)
                {
                    dictionary.Put(wordEntry.Key, tagsForWord.ToArray());
                }
            }
        }
            public T Read()
            {
                if (isPoisoned)
                {
                    throw new InvalidOperationException();
                }

                // skip training samples
                while (index % numberOfPartitions != testIndex)
                {
                    sampleStream.Read();
                    index++;
                }

                index++;

                return(sampleStream.Read());
            }
Example #8
        /// <summary>
        /// Reads all sample objects from the stream and evaluates each sample object with the <see cref="M:ProcessSample"/> method.
        /// </summary>
        /// <param name="samples">The samples.</param>
        public void Evaluate(IObjectStream <T> samples)
        {
            T sample;

            while ((sample = samples.Read()) != null)
            {
                EvaluateSample(sample);
            }
        }
            /// <summary>
            /// Returns the next object. Calling this method repeatedly until it returns
            /// null will return each object from the underlying source exactly once.
            /// </summary>
            /// <returns>
            /// The next object or null to signal that the stream is exhausted.
            /// </returns>
            public T Read()
            {
                if (testSampleStream != null || isPoisoned)
                {
                    throw new InvalidOperationException();
                }

                // If the test element is reached, skip over it so it is not included in
                // the training data.
                if (index % numberOfPartitions == testIndex)
                {
                    sampleStream.Read();
                    index++;
                }

                index++;

                return(sampleStream.Read());
            }
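The two partition readers above implement cross-validation folds with a simple modulo rule: the test reader returns only the samples whose running index satisfies index % numberOfPartitions == testIndex, while the training reader skips exactly those. A small, hypothetical illustration (none of these names come from the library) of which sample indices land in which fold:

    using System;

    public static class PartitionSplitDemo
    {
        public static void Main()
        {
            const int numberOfPartitions = 3;
            const int testIndex = 1;

            // For 3 partitions and testIndex 1, samples 1, 4, 7, ... form the
            // test fold; every other sample belongs to the training fold.
            for (var index = 0; index < 9; index++)
            {
                var fold = index % numberOfPartitions == testIndex ? "test" : "train";
                Console.WriteLine($"sample {index}: {fold}");
            }
        }
    }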
Example #10
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public virtual T Read()
        {
            var value = stream.Read();

            if (value == null)
            {
                IsFinished = true;
            }
            return(value);
        }
Example #11
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public TokenSample Read()
        {
            var inputString = input.Read();

            if (inputString != null)
            {
                var tokens = tokenizer.TokenizePos(inputString);
                return(new TokenSample(inputString, tokens));
            }

            return(null);
        }
Example #12
        /// <summary>
        /// Returns the next <see cref="T:SentenceSample"/>. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public SentenceSample Read()
        {
            if (sentence == null)
            {
                sentence = adSentenceStream.Read();
                UpdateMeta();
                if (sentence == null)
                {
                    return(null);
                }
            }

            var document  = new StringBuilder();
            var sentences = new List <Span>();

            do
            {
                do
                {
                    if (!isTitle || (isTitle && isIncludeTitles))
                    {
                        if (HasPunctuation(sentence.Text))
                        {
                            var start = document.Length;
                            document.Append(sentence.Text);
                            sentences.Add(new Span(start, document.Length));
                            document.Append(' ');
                        }
                    }
                    sentence = adSentenceStream.Read();
                    UpdateMeta();
                } while (isSamePara);
            } while (isSameText);

            return(new SentenceSample(
                       document.Length > 0 ? document.ToString(0, document.Length - 1) : document.ToString(),
                       sentences.ToArray()
                       ));
        }
Example #13
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public POSSample Read()
        {
            callsCount++;

            AdSentence paragraph;

            while ((paragraph = adSentenceStream.Read()) != null)
            {
                var root         = paragraph.Root;
                var sentence     = new List <string>();
                var tags         = new List <string>();
                var contractions = new List <string>();
                var prop         = new List <string>();
                Process(root, sentence, tags, contractions, prop);

                if (sentence.Count != contractions.Count || sentence.Count != prop.Count)
                {
                    throw new InvalidOperationException("The processed information must have the same length.");
                }

                if (additionalContext)
                {
                    //String[][] ac = new String[2][sentence.size()];

                    var ac = new string[2][];

                    ac[0] = new string[sentence.Count];
                    ac[1] = new string[sentence.Count];

                    // line 0: contractions
                    // line 1: props
                    for (var i = 0; i < sentence.Count; i++)
                    {
                        if (contractions[i] != null)
                        {
                            ac[0][i] = contractions[i];
                        }
                        if (prop[i] != null)
                        {
                            ac[1][i] = prop[i];
                        }
                    }

                    return(new POSSample(sentence.ToArray(), tags.ToArray(), ac));
                }
                return(new POSSample(sentence.ToArray(), tags.ToArray()));
            }
            return(null);
        }
Example #14
        /// <summary>
        /// Builds the NGram dictionary with the given samples.
        /// </summary>
        /// <param name="samples">The samples.</param>
        /// <param name="cutoff">The cutoff.</param>
        /// <returns>The NGram dictionary.</returns>
        public static Dic BuildNGramDictionary(IObjectStream <POSSample> samples, int cutoff)
        {
            var       model = new NGramModel();
            POSSample sample;

            while ((sample = samples.Read()) != null)
            {
                if (sample.Sentence.Length > 0)
                {
                    model.Add(new StringList(sample.Sentence), 1, 1);
                }
            }
            model.CutOff(cutoff, int.MaxValue);

            return(model.ToDictionary());
        }
        /// <summary>
        /// Returns the next <see cref="Sequence"/>. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next <see cref="Sequence"/> or null to signal that the stream is exhausted.
        /// </returns>
        public Sequence Read()
        {
            var sample = objectStream.Read();

            if (sample != null)
            {
                var events = new Event[sample.Sentence.Length];
                for (var i = 0; i < events.Length; i++)
                {
                    events[i] = new Event(sample.Tags[i],
                                          contextGenerator.GetContext(i, sample.Sentence, sample.Tags, null));
                }
                return(new Sequence(events, sample));
            }

            return(null);
        }
        /// <summary>
        /// Returns the next <see cref="Sequence"/>. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public Sequence Read()
        {
            var sample = samples.Read();

            if (sample != null)
            {
                var events = new Event[sample.Sentence.Count];
                for (int i = 0, count = sample.Sentence.Count; i < count; i++)
                {
                    events[i] = new Event(
                        sample.Tags[i],
                        // it is safe to pass the tags as previous tags because
                        // the context generator does not look for non predicted tags
                        contextGenerator.GetContext(i, sample.Sentence.ToArray(), sample.Tags.ToArray(), null));
                }
                return(new Sequence(events, sample));
            }

            return(null);
        }
Example #17
        /// <summary>
        /// Returns the next <see cref="T:Event"/>. Calling this method repeatedly until it returns
        /// null will return each event from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public Event Read()
        {
            if (events.MoveNext())
            {
                return(events.Current);
            }

            T sample;

            while ((sample = samples.Read()) != null)
            {
                events = CreateEvents(sample);

                if (events != null && events.MoveNext())
                {
                    return(events.Current);
                }
            }

            return(null);
        }
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public NameSample Read()
        {
            AdSentence paragraph;

            while ((paragraph = adSentenceStream.Read()) != null)
            {
                if (monitor != null)
                {
                    monitor.Token.ThrowIfCancellationRequested();
                }

                var root     = paragraph.Root;
                var sentence = new List <string>();
                var names    = new List <Span>();

                Process(root, sentence, names);

                return(new NameSample(sentence.ToArray(), names.ToArray(), true));
            }
            return(null);
        }
Example #19
        public Sequence Read()
        {
            var sample = samples.Read();

            if (sample == null)
            {
                return(null);
            }

            var events = new Event[sample.Length];

            for (var i = 0; i < sample.Length; i++)
            {
                // it is safe to pass the tags as previous tags because
                // the context generator does not look for non predicted tags
                var context = contextGenerator.GetContext(i, sample.Tokens, sample.Tags, sample.Lemmas);

                events[i] = new Event(sample.Tags[i], context);
            }
            return(new Sequence(events, sample));
        }
        /// <summary>
        /// Creates a n-gram dictionary from the specified data stream using the specified head rule and specified cut-off.
        /// </summary>
        /// <param name="data">The data stream of parses.</param>
        /// <param name="rules">The head rules for the parses.</param>
        /// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
        /// <returns>A dictionary object.</returns>
        public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
            var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
            var dict = new NGramModel();

            Parse p;
            while ((p = data.Read()) != null) {
                p.UpdateHeads(rules);
                var pWords = p.GetTagNodes();
                var words = new string[pWords.Length];
                //add all uni-grams
                for (var wi = 0; wi < words.Length; wi++) {
                    words[wi] = pWords[wi].CoveredText;
                }

                dict.Add(new StringList(words), 1, 1);
                //add tri-grams and bi-grams for initial sequence
                var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
                var cWords = new string[chunks.Length];
                for (var wi = 0; wi < cWords.Length; wi++) {
                    cWords[wi] = chunks[wi].Head.CoveredText;
                }
                dict.Add(new StringList(cWords), 2, 3);

                //emulate reductions to produce additional n-grams
                var ci = 0;
                while (ci < chunks.Length) {
                    /*
                    if (chunks[ci].Parent == null) {
                        chunks[ci].Show();
                    } */
                    if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                        //perform reduce
                        var reduceStart = ci;
                        while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                            reduceStart--;
                        }
                        reduceStart++;
                        chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                        ci = reduceStart;
                        if (chunks.Length != 0) {
                            var window = new string[5];
                            var wi = 0;
                            if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].Head.CoveredText;
                            if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].Head.CoveredText;
                            window[wi++] = chunks[ci].Head.CoveredText;
                            if (ci + 1 < chunks.Length) window[wi++] = chunks[ci + 1].Head.CoveredText;
                            if (ci + 2 < chunks.Length) window[wi++] = chunks[ci + 2].Head.CoveredText;
                            if (wi < 5) {
                                var subWindow = new string[wi];
                                for (var swi = 0; swi < wi; swi++) {
                                    subWindow[swi] = window[swi];
                                }
                                window = subWindow;
                            }
                            if (window.Length >= 3) {
                                dict.Add(new StringList(window), 2, 3);
                            } else if (window.Length == 2) {
                                dict.Add(new StringList(window), 2, 2);
                            }
                        }
                        ci = reduceStart - 1; //ci will be incremented at end of loop
                    }
                    ci++;
                }
            }
            dict.CutOff(cutoff, int.MaxValue);
            return dict.ToDictionary(true);
        }
Example #21
        private List<ComparableEvent> Index(
            IObjectStream<Event> indexEventStream,
            Dictionary<string, int> predicateIndex) {

            var map = new Dictionary<string, int>();
            var indexedContext = new List<int>();
            var eventsToCompare = new List<ComparableEvent>();
            int outcomeCount = 0;

            Event ev;
            while ((ev = indexEventStream.Read()) != null) {
                int ocID;

                if (Monitor != null && Monitor.Token.CanBeCanceled)
                    Monitor.Token.ThrowIfCancellationRequested();

                if (map.ContainsKey(ev.Outcome)) {
                    ocID = map[ev.Outcome];
                } else {
                    ocID = outcomeCount++;
                    map[ev.Outcome] = ocID;
                }

                // ReSharper disable once LoopCanBeConvertedToQuery
                foreach (var pred in ev.Context) {
                    if (predicateIndex.ContainsKey(pred)) {
                        indexedContext.Add(predicateIndex[pred]);
                    }
                }

                // drop events with no active features
                if (indexedContext.Count > 0) {
                    var cons = new int[indexedContext.Count];
                    for (int ci = 0; ci < cons.Length; ci++) {
                        cons[ci] = indexedContext[ci];
                    }
                    eventsToCompare.Add(new ComparableEvent(ocID, cons));
                } else {
                    if (Monitor != null)
                        Monitor.OnWarning(string.Format("Dropped event {0}:{1}", ev.Outcome, ev.Context.ToDisplay()));
                }
                indexedContext.Clear();
            }

            outcomeLabels = ToIndexedStringArray(map);
            predLabels = ToIndexedStringArray(predicateIndex);
            return eventsToCompare;
        }
Example #22
        public static void PopulatePOSDictionary(IObjectStream<POSSample> samples, IMutableTagDictionary dictionary, bool caseSensitive, int cutoff) {

            var newEntries = new Dictionary<string, Dictionary<string, int>>();
            POSSample sample;
            while ((sample = samples.Read()) != null) {

                for (int i = 0; i < sample.Sentence.Length; i++) {
                    if (!StringPattern.Recognize(sample.Sentence[i]).ContainsDigit) {
                        string word = caseSensitive ? sample.Sentence[i] : sample.Sentence[i].ToLowerInvariant();

                        if (!newEntries.ContainsKey(word)) {
                            newEntries.Add(word, new Dictionary<string, int>());
                        }

                        var dicTags = dictionary.GetTags(word);
                        if (dicTags != null) {
                            foreach (var tag in dicTags) {
                                if (!newEntries[word].ContainsKey(tag)) {
                                    newEntries[word].Add(tag, cutoff);
                                }
                            }
                        }

                        if (!newEntries[word].ContainsKey(sample.Tags[i])) {
                            newEntries[word].Add(sample.Tags[i], 1);
                        } else {
                            newEntries[word][sample.Tags[i]]++;
                        }
                    }
                }
            }

            foreach (var wordEntry in newEntries) {
                var tagsForWord = (from entry in wordEntry.Value where entry.Value >= cutoff select entry.Key).ToList();
                if (tagsForWord.Count > 0)
                    dictionary.Put(wordEntry.Key, tagsForWord.ToArray());
                
            }
        }
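Both PopulatePOSDictionary variants above count tag occurrences per word and keep only the tags that reach the cutoff; tags already present in the supplied dictionary are seeded at the cutoff value, so they always survive. A hedged usage sketch (the stream, sample, and dictionary types are the library's own; the method name BuildTagDictionary and the argument values are illustrative only):

        // Illustrative call only; the samples and dictionary are obtained elsewhere.
        // With cutoff = 5, a tag observed fewer than five times for a word is
        // dropped unless the dictionary already listed it for that word.
        public static void BuildTagDictionary(IObjectStream<POSSample> samples, IMutableTagDictionary dictionary)
        {
            PopulatePOSDictionary(samples, dictionary, caseSensitive: true, cutoff: 5);
        }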
Example #23
        /// <summary>
        /// Builds the NGram dictionary with the given samples.
        /// </summary>
        /// <param name="samples">The samples.</param>
        /// <param name="cutoff">The cutoff.</param>
        /// <returns>The NGram dictionary.</returns>
        public static Dict BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {

            var model = new NGramModel();
            POSSample sample;

            while ((sample = samples.Read()) != null) {

                if (sample.Sentence.Length > 0) {
                    model.Add(new StringList(sample.Sentence), 1, 1);
                }

            }
            model.CutOff(cutoff, int.MaxValue);

            return model.ToDictionary();
        }
Example #24
        /// <summary>
        /// Creates a n-gram dictionary from the specified data stream using the specified head rule and specified cut-off.
        /// </summary>
        /// <param name="data">The data stream of parses.</param>
        /// <param name="rules">The head rules for the parses.</param>
        /// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
        /// <returns>A dictionary object.</returns>
        public static Dic BuildDictionary(IObjectStream <Parse> data, AbstractHeadRules rules, TrainingParameters parameters)
        {
            var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
            var dict   = new NGramModel();

            Parse p;

            while ((p = data.Read()) != null)
            {
                p.UpdateHeads(rules);
                var pWords = p.GetTagNodes();
                var words  = new string[pWords.Length];
                //add all uni-grams
                for (var wi = 0; wi < words.Length; wi++)
                {
                    words[wi] = pWords[wi].CoveredText;
                }

                dict.Add(new StringList(words), 1, 1);
                //add tri-grams and bi-grams for initial sequence
                var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
                var cWords = new string[chunks.Length];
                for (var wi = 0; wi < cWords.Length; wi++)
                {
                    cWords[wi] = chunks[wi].Head.CoveredText;
                }
                dict.Add(new StringList(cWords), 2, 3);

                //emulate reductions to produce additional n-grams
                var ci = 0;
                while (ci < chunks.Length)
                {
                    /*
                     * if (chunks[ci].Parent == null) {
                     *  chunks[ci].Show();
                     * } */
                    if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags))
                    {
                        //perform reduce
                        var reduceStart = ci;
                        while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent))
                        {
                            reduceStart--;
                        }
                        reduceStart++;
                        chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                        ci     = reduceStart;
                        if (chunks.Length != 0)
                        {
                            var window = new string[5];
                            var wi     = 0;
                            if (ci - 2 >= 0)
                            {
                                window[wi++] = chunks[ci - 2].Head.CoveredText;
                            }
                            if (ci - 1 >= 0)
                            {
                                window[wi++] = chunks[ci - 1].Head.CoveredText;
                            }
                            window[wi++] = chunks[ci].Head.CoveredText;
                            if (ci + 1 < chunks.Length)
                            {
                                window[wi++] = chunks[ci + 1].Head.CoveredText;
                            }
                            if (ci + 2 < chunks.Length)
                            {
                                window[wi++] = chunks[ci + 2].Head.CoveredText;
                            }
                            if (wi < 5)
                            {
                                var subWindow = new string[wi];
                                for (var swi = 0; swi < wi; swi++)
                                {
                                    subWindow[swi] = window[swi];
                                }
                                window = subWindow;
                            }
                            if (window.Length >= 3)
                            {
                                dict.Add(new StringList(window), 2, 3);
                            }
                            else if (window.Length == 2)
                            {
                                dict.Add(new StringList(window), 2, 2);
                            }
                        }
                        ci = reduceStart - 1; //ci will be incremented at end of loop
                    }
                    ci++;
                }
            }
            dict.CutOff(cutoff, int.MaxValue);
            return(dict.ToDictionary(true));
        }
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public NameSample Read()
        {
            var sentence = new List <string>();
            var tags     = new List <string>();

            var isClearAdaptiveData = false;

            // Empty line indicates end of sentence

            string line;

            while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line))
            {
                if (line.StartsWith("###MEDLINE:"))
                {
                    isClearAdaptiveData = true;
                    lineStream.Read();
                    continue;
                }

                if (line.Contains("ABSTRACT TRUNCATED"))
                {
                    continue;
                }

                var fields = line.Split('\t');

                if (fields.Length == 2)
                {
                    sentence.Add(fields[0]);
                    tags.Add(fields[1]);
                }
                else
                {
                    throw new InvalidFormatException("Expected two fields per line in training data, got " +
                                                     fields.Length + " for line '" + line + "'!");
                }
            }

            if (sentence.Count > 0)
            {
                // convert name tags into spans
                var names = new List <Span>();

                var beginIndex = -1;
                var endIndex   = -1;
                for (var i = 0; i < tags.Count; i++)
                {
                    var tag = tags[i];

                    if (tag.EndsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
                    {
                        tag = "O";
                    }
                    if (tag.EndsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.StartsWith("B-"))
                    {
                        if (beginIndex != -1)
                        {
                            names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                            //beginIndex = -1;
                            //endIndex = -1;
                        }

                        beginIndex = i;
                        endIndex   = i + 1;
                    }
                    else if (tag.StartsWith("I-"))
                    {
                        endIndex++;
                    }
                    else if (tag.Equals("O"))
                    {
                        if (beginIndex != -1)
                        {
                            names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                            beginIndex = -1;
                            endIndex   = -1;
                        }
                    }
                    else
                    {
                        throw new IOException("Invalid tag: " + tag);
                    }
                }

                // if one span remains, create it here
                if (beginIndex != -1)
                {
                    names.Add(new Span(beginIndex, endIndex, tags[beginIndex].Substring(2)));
                }

                return(new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData));
            }
            if (line != null)
            {
                // Just filter out empty events, if two lines in a row are empty
                return(Read());
            }
            // source stream is not returning anymore lines
            return(null);
        }
Example #26
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public NameSample Read()
        {
            var sentence = new List <string>();
            var tags     = new List <string>();

            var ClearAdaptiveData = false;

            // Empty line indicates end of sentence
            string line;

            while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line))
            {
                if (language == Language.Nl && line.StartsWith(DocStart))
                {
                    ClearAdaptiveData = true;
                    continue;
                }

                var fields = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                if (fields.Length == 3)
                {
                    sentence.Add(fields[0]);
                    tags.Add(fields[2]);
                }
                else
                {
                    throw new InvalidFormatException(
                              string.Format("Expected three fields per line in training data, got {0} for line '{1}'!",
                                            fields.Length, line));
                }
            }

            // Always clear adaptive data for Spanish
            if (language == Language.Es)
            {
                ClearAdaptiveData = true;
            }

            if (sentence.Count > 0)
            {
                // convert name tags into spans
                var names = new List <Span>();

                var beginIndex = -1;
                var endIndex   = -1;
                for (var i = 0; i < tags.Count; i++)
                {
                    var tag = tags[i];

                    if (tag.EndsWith("PER") && (types & Types.PersonEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("ORG") && (types & Types.OrganizationEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("LOC") && (types & Types.LocationEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("MISC") && (types & Types.MiscEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.StartsWith("B-"))
                    {
                        if (beginIndex != -1)
                        {
                            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                            //beginIndex = -1;
                            //endIndex = -1;
                        }

                        beginIndex = i;
                        endIndex   = i + 1;
                    }
                    else if (tag.StartsWith("I-"))
                    {
                        endIndex++;
                    }
                    else if (tag.Equals("O"))
                    {
                        if (beginIndex != -1)
                        {
                            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                            beginIndex = -1;
                            endIndex   = -1;
                        }
                    }
                    else
                    {
                        throw new InvalidFormatException("Invalid tag: " + tag);
                    }
                }

                // if one span remains, create it here
                if (beginIndex != -1)
                {
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                }

                return(new NameSample(sentence.ToArray(), names.ToArray(), ClearAdaptiveData));
            }

            return(line != null ? Read() : null);
        }
Example #27
        /// <summary>
        /// Returns the next <see cref="PtbNode"/> object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next <see cref="PtbNode"/> object or <c>null</c> to signal that the stream is exhausted.
        /// </returns>
        public PtbNode Read()
        {
retry:

            if (monitor != null && monitor.Token.CanBeCanceled)
            {
                monitor.Token.ThrowIfCancellationRequested();
            }

            PtbNode root = null;

            var    pos     = 0; // text position
            var    stack   = new Stack <PtbNode>();
            var    invalid = false;
            string line;

            while ((line = lineStream.Read()) != null)
            {
                line = line.TrimStart(' ', '\t', '\u00A0'); // \u00A0 = NO-BREAK SPACE

                if (invalid)
                {
                    if (line.Trim().Length == 0) // end of sentence
                    {
                        goto retry;
                    }

                    for (var i = 0; i < line.Length; i++)
                    {
                        switch (line[i])
                        {
                        case '#':
                            if (i == 0)
                            {
                                goto next;     // ignore comment
                            }
                            break;

                        case '(':
                            stack.Push(null);
                            continue;

                        case ')':
                            stack.Pop();

                            if (stack.Count == 0)
                            {
                                goto done;
                            }

                            continue;

                        default:
                            continue;
                        }
                    }

                    continue; // ignore invalid data
                }

                if (line.Length == 0)
                {
                    if (root != null)
                    {
                        goto done;
                    }

                    continue;
                }

                for (var i = 0; i < line.Length; i++)
                {
                    switch (line[i])
                    {
                    case '#':
                        if (i == 0)
                        {
                            goto next;     // ignore comment, if the line starts with '#'
                        }
                        continue;

                    case '(':
                        var rest  = line.Substring(i + 1);
                        var type  = resolver.GetType(rest, useFunctionTags);
                        var token = resolver.GetToken(rest);

                        if (type == null)
                        {
                            if (monitor != null)
                            {
                                monitor.OnWarning("Penn treebank node without type: " + line);
                            }

                            stack.Push(null);

                            invalid = true;
                            goto next;
                        }

                        /* skip a few chars to improve performance (if possible)... */
                        int skip;
                        if (token != null && (skip = rest.IndexOf(')')) != -1)
                        {
                            i += skip;
                        }

                        var child = token != null
                            ? new PtbNode { Type = type, Token = token, Span = new Span(pos, pos + token.Length) }
                            : new PtbNode { Type = type };

                        if (token != null)
                        {
                            pos += token.Length + 1;
                        }

                        if (root == null)
                        {
                            root = child;
                        }

                        if (stack.Count > 0)
                        {
                            var parent = stack.Peek();
                            // check if the parent node is a gap
                            if (parent != null)
                            {
                                parent.Children.Add(child);
                            }
                            else
                            {
                                // search for the parent node that is not a gap
                                var array = stack.ToArray();
                                foreach (var p in array)
                                {
                                    if (p == null)
                                    {
                                        continue;
                                    }
                                    p.Children.Add(child);
                                    break;
                                }
                            }
                        }

                        stack.Push(child);

                        continue;

                    case ')':
                        var pop = stack.Pop();

                        // adjust span
                        if (pop != null)
                        {
                            if (pop.HasChildren)
                            {
                                var s = GetStartPos(pop);
                                var e = GetEndPos(pop);
                                if (s.HasValue && e.HasValue)
                                {
                                    pop.Span = new Span(s.Value, e.Value);
                                }
                            }

                            if (pop.Span == null)
                            {
                                pop.Span = null;
                            }
                        }

                        if (stack.Count == 0)
                        {
                            goto done;
                        }

                        continue;
                    }
                }
next:
                ;
            }

done:

            // check if invalid.
            if (invalid || stack.Count != 0)
            {
                if (monitor != null)
                {
                    monitor.OnWarning("A invalid Penn Treebank sentence was skipped.");
                }

                goto retry;
            }

            // End of stream
            if (root == null)
            {
                return(null);
            }

            var rs = GetStartPos(root);
            var re = GetEndPos(root);

            root.Span = new Span(rs.Value, re.Value);

            // if the stack is not empty, the sentence is incomplete/invalid.
            return(stack.Count == 0
                ? root
                : null);
        }
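For reference, a hypothetical bracketed input and the leaf spans the PtbNode reader above would assign, assuming the resolver returns the terminal text of each leaf; the reader advances pos by the token length plus one separating space:

    using System;

    public static class PtbSpanDemo
    {
        public static void Main()
        {
            // Hypothetical sentence for "(S (NP (DT the) (NN dog)) (VP (VBZ barks)))".
            var tokens = new[] { "the", "dog", "barks" };
            var pos = 0;

            // Mirrors the leaf handling above: each token gets Span(pos, pos + token.Length),
            // then pos advances by the token length plus one.
            foreach (var token in tokens)
            {
                Console.WriteLine($"{token} -> Span({pos}, {pos + token.Length})");
                pos += token.Length + 1;
            }
            // Prints: the -> Span(0, 3), dog -> Span(4, 7), barks -> Span(8, 13)
        }
    }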
        /// <summary>
        /// Returns the next object. Calling this method repeatedly until it returns
        /// null will return each object from the underlying source exactly once.
        /// </summary>
        /// <returns>
        /// The next object or null to signal that the stream is exhausted.
        /// </returns>
        public NameSample Read()
        {
            var sentence = new List <string>();
            var tags     = new List <string>();

            var isClearAdaptiveData = false;

            // Empty line indicates end of sentence

            string line;

            while ((line = lineStream.Read()) != null && !string.IsNullOrWhiteSpace(line))
            {
                if (line.StartsWith(DocStart))
                {
                    isClearAdaptiveData = true;

                    line = lineStream.Read();
                    if (!string.IsNullOrEmpty(line))
                    {
                        throw new InvalidFormatException("Empty line after -DOCSTART- not empty: '" + line + "'!");
                    }

                    continue;
                }

                var fields = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                if (language == Language.En && fields.Length == 4)
                {
                    // For English: WORD  POS-TAG SC-TAG NE-TAG
                    sentence.Add(fields[0]);
                    tags.Add(fields[3]);
                }
                else if (language == Language.De && fields.Length == 5)
                {
                    // For German: WORD  LEMMA-TAG POS-TAG SC-TAG NE-TAG
                    sentence.Add(fields[0]);
                    tags.Add(fields[4]);
                }
                else
                {
                    throw new InvalidFormatException(
                              $"Incorrect number of fields per line for language: '{line}'!");
                }
            }

            if (sentence.Count > 0)
            {
                // convert name tags into spans
                var names = new List <Span>();

                var beginIndex = -1;
                var endIndex   = -1;
                for (var i = 0; i < tags.Count; i++)
                {
                    var tag = tags[i];

                    if (tag.EndsWith("PER") && (types & Types.PersonEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("ORG") && (types & Types.OrganizationEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("LOC") && (types & Types.LocationEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag.EndsWith("MISC") && (types & Types.MiscEntities) == 0)
                    {
                        tag = "O";
                    }

                    if (tag == "O")
                    {
                        if (beginIndex == -1)
                        {
                            continue;
                        }

                        names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                        beginIndex = -1;
                        endIndex   = -1;
                    }
                    else if (tag.StartsWith("B-"))
                    {
                        // B- prefix means we have two same entities next to each other
                        if (beginIndex != -1)
                        {
                            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                        }
                        beginIndex = i;
                        endIndex   = i + 1;
                    }
                    else if (tag.StartsWith("I-"))
                    {
                        // I- starts or continues a current name entity
                        if (beginIndex == -1)
                        {
                            beginIndex = i;
                            endIndex   = i + 1;
                        }
                        else if (!tag.EndsWith(tags[beginIndex].Substring(1)))
                        {
                            // we have a new tag type following a tagged word series
                            // also may not have the same I- starting the previous!
                            names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                            beginIndex = i;
                            endIndex   = i + 1;
                        }
                        else
                        {
                            endIndex++;
                        }
                    }
                    else
                    {
                        throw new InvalidFormatException("Invalid tag: " + tag);
                    }
                }

                // if one span remains, create it here
                if (beginIndex != -1)
                {
                    names.Add(Extract(beginIndex, endIndex, tags[beginIndex]));
                }

                return(new NameSample(sentence.ToArray(), names.ToArray(), isClearAdaptiveData));
            }
            if (line != null)
            {
                // Just filter out empty events, if two lines in a row are empty
                return(Read());
            }
            // source stream is not returning anymore lines
            return(null);
        }