Exemple #1
0
        /// <summary>
        /// Determines whether a sentence is the Gutenberg "end mark": its first
        /// three words are "***", "END", "OF" (the latter two compared
        /// case-insensitively) and its last word is "***".
        ///
        /// Examples:
        /// 1. *** END OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***
        /// 2. ***END OF THE PROJECT GUTENBERG EBOOK THE DEFENCE OF LUCKNOW***
        /// </summary>
        /// <param name="sentence">Sentence whose words are inspected.</param>
        /// <returns>
        /// true when the sentence matches the end mark pattern; otherwise false.
        /// </returns>
        private bool CheckIfEndMark(InternalSentence sentence)
        {
            const string marker = "***";

            var words = sentence.words;

            // at least "***" + "END" + "OF" + the closing "***"
            if (words == null || words.Count < 4)
            {
                return false;
            }

            // first word should be ***
            if (!string.Equals(
                    words[0].annotatedText, marker, StringComparison.Ordinal))
            {
                return false;
            }

            // the next two words should be: END OF
            // (ordinal, case-insensitive comparison - no culture-dependent
            // lower-casing and no failure on a null text)
            if (!string.Equals(
                    words[1].annotatedText, "end",
                    StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }

            if (!string.Equals(
                    words[2].annotatedText, "of",
                    StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }

            // finally, the sentence should end with ***
            return string.Equals(
                words[words.Count - 1].annotatedText, marker,
                StringComparison.Ordinal);
        }
        /// <summary>
        /// Converts a Stanford word into an InternalWord and appends it to
        /// <paramref name="sentence"/>.
        /// </summary>
        /// <param name="stanfordWord">
        /// Stanford token providing the text and the begin/end character offset
        /// annotations.
        /// </param>
        /// <param name="sentence">Sentence whose word list receives the word.</param>
        /// <returns>
        /// true when the word was added; false when an annotation is missing, an
        /// offset is not a valid unsigned integer, or the offsets do not describe
        /// a valid slice of the raw data.
        /// </returns>
        private bool ExtractWord(
            CoreMap stanfordWord, ref InternalSentence sentence)
        {
            object rawBegin = stanfordWord.get(_charOffsetBeginAnnotationClass);
            object rawEnd   = stanfordWord.get(_charOffsetEndAnnotationClass);
            object rawText  = stanfordWord.get(_textAnnotationClass);

            // a missing annotation is reported as a failure instead of a
            // NullReferenceException
            if ((rawBegin == null) || (rawEnd == null) || (rawText == null))
            {
                return false;
            }

            uint offsetBegin, offsetEnd;

            if (!UInt32.TryParse(rawBegin.ToString(), out offsetBegin))
            {
                return false;
            }

            if (!UInt32.TryParse(rawEnd.ToString(), out offsetEnd))
            {
                return false;
            }

            // Substring throws on an inverted or out-of-range span; this method
            // signals failures through its return value, so reject such offsets
            if ((offsetEnd < offsetBegin) || (offsetEnd > _rawData.Length))
            {
                return false;
            }

            sentence.words.Add(new InternalWord()
            {
                annotatedText = rawText.ToString(),
                // the original text is sliced out of the raw data using the
                // character offsets
                originalText = _rawData.Substring(
                    (int)offsetBegin, (int)(offsetEnd - offsetBegin)),
                characterOffsetBegin = offsetBegin,
                characterOffsetEnd   = offsetEnd
            });

            return true;
        }
Exemple #3
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Scans forward from the current sentence position looking for the end
        /// mark. The document is considered valid only when an end mark is found
        /// and at least one other sentence precedes it in the scanned range.
        /// </summary>
        /// <returns>
        /// true when an end mark is found after at least one non-end-mark
        /// sentence; false when a sentence cannot be read, when the end mark is
        /// the first sentence scanned, or when no end mark is found.
        /// </returns>
        private bool CheckDocumentValidityAfterMetaData()
        {
            InternalSentence candidate = new InternalSentence();
            bool sawContent            = false;

            for (uint position = _currentPosition.Sentence;
                 position < _stanfordDocument.SentencesCount;
                 position++)
            {
                if (!_stanfordDocument.GetSentence(position, ref candidate))
                {
                    return false;
                }

                // reaching the end mark finishes the scan; the outcome depends on
                // whether any other sentence was seen before it
                if (CheckIfEndMark(candidate))
                {
                    return sawContent;
                }

                // anything that isn't the end mark counts as content
                sawContent = true;
            }

            // ran out of sentences without meeting an end mark
            return false;
        }
Exemple #4
0
        /// <summary>
        /// Called when parsing a phrase and not a gutenberg file. Resets the
        /// current sentence and loads the sentence at the current position into
        /// it.
        /// </summary>
        /// <returns>
        /// true when the sentence was loaded; false when there is no sentence at
        /// the current position or the sentence could not be read.
        /// </returns>
        private bool InitForParsing()
        {
            _currentSentence = new InternalSentence();

            // stay within the available sentences (e.g. an empty phrase) so the
            // failure is reported through the return value
            if (_currentPosition.Sentence >= _stanfordDocument.SentencesCount)
            {
                return false;
            }

            return _stanfordDocument.GetSentence(
                _currentPosition.Sentence, ref _currentSentence);
        }
        //-------------------------------------------------------------------------
        /// <summary>
        /// Extracts (key, value) meta-data pairs from a sentence and stores them
        /// in the meta-data fields, keyed by the lower-cased key.
        ///
        /// Examples:
        /// Title: Pride and Prejudice
        ///
        /// Title: The Defence of Lucknow
        ///    A Diary Recording the Daily Events during the Siege of the...
        ///
        /// Author: Jane Austen
        /// Posting Date: August 26, 2008 [EBook #1342]
        /// Release Date: June, 1998
        /// Last updated: February 15, 2015]
        /// [Last updated: December 20, 2011]
        /// Language: English
        ///
        ///
        /// Sometimes Stanford will return a sentence with multiple (Key: Value)
        /// pairs - so we will not use Stanford word parsing here
        /// </summary>
        /// <param name="sentence">Sentence whose raw text is parsed.</param>
        public void Parse(InternalSentence sentence)
        {
            // first, we split the sentence into lines - this will help tackling both
            // multi-line values (like the Title: The Defence of...)
            // and multi-field sentences - like the dates example above
            //
            // we handle both Windows and Unix style newlines regardless of the
            // platform we run on ("\r\n" is listed first for readability; the
            // earliest match in the text wins, so it is consumed as one separator)
            var splitData = sentence.text.Split(
                new string[] { "\r\n", "\n" }, StringSplitOptions.None);

            int index = 0;

            while (index < splitData.Length)
            {
                string currentItem = splitData[index];
                // try to get Gutenberg id: [EBook #id]
                TryExtractingGutenbergId(currentItem);

                TrimBrackets(ref currentItem);

                // extract (key, value) pair
                string key = "", value = "";
                if (!ExtractKeyValue(currentItem, ref key, ref value))
                {
                    index++;
                    continue;
                }

                // keys are normalised once, culture-independently, so lookups do
                // not depend on the current culture
                string normalizedKey = key.ToLowerInvariant();

                if (_whitelistedKeys.ContainsKey(normalizedKey))
                {
                    // whitelisted keys may continue on the following lines; skip
                    // the lines that were consumed into the value
                    int linesAdded;
                    AppendNextLinesToValue(splitData, index, ref value, out linesAdded);
                    index += linesAdded;
                }
                else
                {
                    // if it isn't a whitelisted key - we need to clean up values
                    // according to what we know we don't need (by reviewing documents)
                    CleanUpValue(ref value);
                }

                index++;

                // add to member variable
                // NOTE(review): if _metaDataFields is a Dictionary, Add throws on a
                // repeated key - confirm duplicates cannot occur in one document
                _metaDataFields.Add(normalizedKey, value);
            }
        }
        /// <summary>
        /// Extracts the sentence at <paramref name="index"/>, including all of
        /// its words, into <paramref name="sentence"/>.
        /// </summary>
        /// <param name="index">Zero-based index of the sentence to extract.</param>
        /// <param name="sentence">
        /// Sentence object that receives the extracted fields. Its word list is
        /// always replaced (with an empty list when the sentence has no tokens),
        /// so a reused object never keeps words of a previous sentence.
        /// </param>
        /// <returns>
        /// true on success; false when the sentence fields or any of its words
        /// could not be extracted.
        /// </returns>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="index"/> is not smaller than the sentence count.
        /// </exception>
        public bool GetSentence(uint index, ref InternalSentence sentence)
        {
            // index is unsigned, so only the upper bound needs checking; ">="
            // also avoids computing "count - 1", which would wrap around for an
            // empty document if the count is unsigned
            if (index >= this.SentencesCount)
            {
                throw new IndexOutOfRangeException();
            }

            // extract the basic sentence fields
            CoreMap stanfordSentence = (CoreMap)_sentences.get((int)index);

            if (!ExtractSentence(stanfordSentence, ref sentence))
            {
                return false;
            }

            // extract words
            java.util.ArrayList stanfordWords =
                (java.util.ArrayList)stanfordSentence.get(_tokensAnnotationClass);

            // start from a fresh list, pre-sized to the number of tokens
            // (a "minor" optimisation)
            sentence.words = new System.Collections.Generic.List<InternalWord>(
                stanfordWords.size());

            foreach (CoreMap stanfordWord in stanfordWords)
            {
                if (!ExtractWord(stanfordWord, ref sentence))
                {
                    // NOTE: currently, we fail the entire sentence even if a single
                    // word fails
                    return false;
                }
            }

            return true;
        }
        /// <summary>
        /// Runs the word through the one-char alphanumeric, suffix and
        /// blacklisted-repeated-chars checks, in that order, stopping at the
        /// first check that fails.
        /// </summary>
        /// <param name="sentence">Sentence that contains the word.</param>
        /// <param name="word">Word being checked.</param>
        /// <param name="wordInSentence">Index of the word in the sentence.</param>
        /// <returns>true when every check passes; otherwise false.</returns>
        public bool Check(
            InternalSentence sentence, InternalWord word, int wordInSentence)
        {
            // "&&" short-circuits, so a later check only runs when all earlier
            // ones passed
            return PassesOneCharAlphaNumericCheck(word)
                   && PassesSuffixCheck(sentence, word, wordInSentence)
                   && PassesBlacklistRepeatedCharsCheck(word);
        }
        /// <summary>
        /// Copies the text and the begin/end character offsets of a Stanford
        /// sentence into a models sentence.
        /// </summary>
        /// <param name="stanfordSentence">Stanford sentence to read from.</param>
        /// <param name="sentence">Sentence that receives the values.</param>
        /// <returns>
        /// true when both offsets were parsed as unsigned integers; otherwise
        /// false.
        /// </returns>
        private bool ExtractSentence(
            CoreMap stanfordSentence, ref InternalSentence sentence)
        {
            sentence.text = stanfordSentence.get(_textAnnotationClass).ToString();

            // the end offset is only read and parsed once the begin offset has
            // been parsed successfully
            return UInt32.TryParse(
                       stanfordSentence.get(_charOffsetBeginAnnotationClass).ToString(),
                       out sentence.fileOffsetBegin)
                   && UInt32.TryParse(
                       stanfordSentence.get(_charOffsetEndAnnotationClass).ToString(),
                       out sentence.fileOffsetEnd);
        }
        // Matches words made up exclusively of ASCII letters and digits. Built
        // once instead of on every call.
        private static readonly Regex _purelyAlphaNumeric =
            new Regex(@"^[a-z0-9]+$", RegexOptions.IgnoreCase);

        /// <summary>
        /// Check if this word is only a suffix for a qualified word.
        /// We ignore such suffixes - for example:
        /// Tom's house...
        /// Tom couldn't (n't)
        /// You're
        ///
        /// NOTE: we might want to make a specific suffix part of the last
        /// word (Could + n't = Couldn't) - in that case, we need to change
        /// the |Check| function altogether
        /// </summary>
        /// <param name="sentence">Sentence that contains the word.</param>
        /// <param name="word">Word being checked.</param>
        /// <param name="wordInSentence">Index of the word in the sentence.</param>
        /// <returns>
        /// false when the word directly follows the previous word and is a known
        /// ignorable suffix; otherwise true.
        /// </returns>
        private bool PassesSuffixCheck(
            InternalSentence sentence, InternalWord word, int wordInSentence)
        {
            // first, get the last word
            int lastWordIndex = wordInSentence - 1;

            if (lastWordIndex < 0)
            {
                // first word of the sentence
                return true;
            }

            InternalWord lastWord = sentence.words[lastWordIndex];

            // check there are no spaces/characters between the last word and the
            // suffix
            if (lastWord.characterOffsetEnd != word.characterOffsetBegin)
            {
                return true;
            }

            // is this an ignorable suffix? (culture-independent lower-casing)
            string lowerCaseWord = word.annotatedText.ToLowerInvariant();

            if (_ignorableSuffixes.Contains(lowerCaseWord))
            {
                return false; // we want to ignore it
            }

            // don't ignore it - add to unknown list only if it contains
            // a non-alphanumeric character, i.e. it is not purely alphanumeric
            if (!_purelyAlphaNumeric.IsMatch(lowerCaseWord))
            {
                _unknownSuffixes.Add(lowerCaseWord);
            }

            return true;
        }
Exemple #10
0
        /// <summary>
        /// Parses document meta-data such as: Author, Title, Language...
        ///
        /// The function iterates all sentences until reaching the: *** START OF ...
        /// - which is the "start mark"
        /// </summary>
        /// <returns>
        /// true when the start mark was found and the sentence following it was
        /// loaded into the current sentence; false when a sentence could not be
        /// read, no start mark was found, or nothing follows the start mark.
        /// </returns>
        private bool ParseMetaData()
        {
            _currentSentence = new InternalSentence();

            while (_currentPosition.Sentence < _stanfordDocument.SentencesCount)
            {
                if (!_stanfordDocument.GetSentence(
                        _currentPosition.Sentence, ref _currentSentence))
                {
                    return false;
                }

                // even if we have reached the content start mark - update positions
                // so that when we parse words we keep track of the current positions
                foreach (var internalWord in _currentSentence.words)
                {
                    _currentPosition.Calculate(internalWord);
                }

                // make sure we move to the next sentence in our counter so that we
                // read the next sentence on the next GetNextWord call
                _currentPosition.IncreaseSentence();

                // if we've reached the content start mark - stop parsing meta data
                if (CheckIfStartMark(_currentSentence))
                {
                    _currentPosition.ResetWordInSentence(); // to be sure this happens

                    // the start mark may be the very last sentence - then there is
                    // no content to read, and asking for a sentence past the end
                    // would throw instead of reporting failure
                    if (_currentPosition.Sentence >= _stanfordDocument.SentencesCount)
                    {
                        return false;
                    }

                    // found the start mark - read next sentence to _currentSentence
                    return _stanfordDocument.GetSentence(
                        _currentPosition.Sentence, ref _currentSentence);
                }

                // try extracting meta data from the sentence
                _metaDataParser.Parse(_currentSentence);
            }

            // ran out of sentences without finding the start mark
            return false;
        }