Exemplo n.º 1
0
        /// <summary>
        /// Advances the running position counters to account for
        /// <paramref name="currentWord"/> and for the raw text that separates
        /// it from the previously processed word.
        /// </summary>
        /// <param name="currentWord">
        /// The word being visited.  Once the counters are updated it is
        /// remembered as the last processed word.
        /// </param>
        public void Calculate(InternalWord currentWord)
        {
            // the gap starts where the previous word ended, or at the very
            // start of the raw data when no word has been processed yet
            int gapStart = (_lastWord != null)
                ? (int)_lastWord.characterOffsetEnd
                : 0;
            int gapLength = (int)currentWord.characterOffsetBegin - gapStart;

            // raw text between the previous word and the current one
            string gapText = _rawData.Substring(gapStart, gapLength);

            _currentByteOffset += GetMultibyteOffset(gapText);

            uint lineBreaks = CountNewLines(gapText);
            _line += lineBreaks;

            // a gap holding at least two newlines starts a new paragraph
            if (lineBreaks >= 2)
            {
                _paragraph++;
            }

            // the current word's own text is accounted for as well
            _currentByteOffset += GetMultibyteOffset(currentWord.originalText);

            // the page is derived from the line counter
            _page = _line / _linesPerPage;

            _lastWord = currentWord;
        }
        /// <summary>
        /// Tests the word's annotated text against the blacklist pattern held
        /// in <c>_blacklistRepeatedCharsRegExp</c>.
        /// </summary>
        /// <param name="word">The word whose annotated text is tested.</param>
        /// <returns>
        /// <c>false</c> when the lower-cased annotated text matches the
        /// blacklist pattern; <c>true</c> otherwise.
        /// </returns>
        private bool PassesBlacklistRepeatedCharsCheck(InternalWord word)
        {
            string candidate = word.annotatedText.ToLower();

            bool blacklisted = Regex.IsMatch(
                candidate,
                _blacklistRepeatedCharsRegExp,
                RegexOptions.IgnoreCase);

            return !blacklisted;
        }
        /// <summary>
        /// Rejects words that span exactly one character when that character
        /// does not match the alphanumeric pattern.
        /// </summary>
        /// <param name="word">The word to examine.</param>
        /// <returns>
        /// <c>true</c> when the word does not span exactly one character, or
        /// when its original text matches the single alphanumeric character
        /// pattern; <c>false</c> otherwise.
        /// </returns>
        private bool PassesOneCharAlphaNumericCheck(InternalWord word)
        {
            // the span is measured with file offsets (not the text length) so
            // that annotations from Stanford are ignored
            bool spansOneChar =
                (word.characterOffsetEnd - word.characterOffsetBegin) == 1;

            if (spansOneChar)
            {
                // a single character only passes when it is alphanumeric
                return Regex.IsMatch(
                    word.originalText,
                    @"^[a-zA-Z0-9]$",
                    RegexOptions.IgnoreCase);
            }

            // anything that is not one character long passes this check
            return true;
        }
Exemplo n.º 4
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Walks forward through the parsed document, sentence by sentence,
        /// until the next qualified word is found.
        /// </summary>
        /// <param name="word">
        /// Always assigned a fresh <c>DocumentWord</c>; filled in from the
        /// current position when a qualified word is found.
        /// </param>
        /// <returns>
        /// <c>true</c> when a qualified word was found and
        /// <paramref name="word"/> was filled; <c>false</c> when an end mark
        /// sentence is reached, or when the sentences run out and the input
        /// does not come from a file.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when no document has been parsed, or when a file-based
        /// document runs out of sentences without an end mark.
        /// </exception>
        public bool GetNextWord(out DocumentWord word)
        {
            word = new DocumentWord();

            if (_stanfordDocument == null)
            {
                throw new Exception("no document parsed");
            }

            // keep pulling sentences until a qualified word turns up or the
            // document ends (a valid end is signalled by an end mark)
            while (NextSentenceCheck())
            {
                // an end mark means the document is finished
                if (CheckIfEndMark(_currentSentence))
                {
                    return false;
                }

                // resume inside the sentence at the tracked word index
                int wordIndex = (int)_currentPosition.InternalWord;

                while (wordIndex < _currentSentence.words.Count)
                {
                    InternalWord candidate = _currentSentence.words[wordIndex];

                    // position counters are updated for every word visited,
                    // qualified or not
                    _currentPosition.Calculate(candidate);

                    if (!_qualifiedWords.Check(
                            _currentSentence, candidate, wordIndex))
                    {
                        _currentPosition.IncreaseWord(false);
                        ++wordIndex;
                        continue;
                    }

                    _currentPosition.FillWord(ref word, candidate);
                    _currentPosition.IncreaseWord(true); // next word index
                    return true;
                }
            }

            // the sentences ran out without a valid end mark: for a file this
            // is an error, for a phrase we simply stop at its end
            if (_fromFile)
            {
                throw new Exception("Unexpected end of document reached!");
            }

            return false;
        }
        /// <summary>
        /// Decides whether a word qualifies by running it through the
        /// one-character, suffix and repeated-characters blacklist checks,
        /// in that order, stopping at the first check that fails.
        /// </summary>
        /// <param name="sentence">The sentence containing the word.</param>
        /// <param name="word">The word to check.</param>
        /// <param name="wordInSentence">
        /// Index of <paramref name="word"/> within the sentence's words.
        /// </param>
        /// <returns>
        /// <c>true</c> when every check passes; <c>false</c> otherwise.
        /// </returns>
        public bool Check(
            InternalSentence sentence, InternalWord word, int wordInSentence)
        {
            // && short-circuits, so a later check only runs when all the
            // earlier ones have passed
            return PassesOneCharAlphaNumericCheck(word)
                && PassesSuffixCheck(sentence, word, wordInSentence)
                && PassesBlacklistRepeatedCharsCheck(word);
        }
        /// <summary>
        /// Check if this word is only a suffix for a qualified word.
        /// We ignore such suffixes - for example:
        /// Tom's house... ('s)
        /// Tom couldn't (n't)
        /// You're ('re)
        ///
        /// NOTE: we might want to make a specific suffix part of the last
        /// word (Could + n't = Couldn't) - in that case, we need to change
        /// the |Check| function altogether
        /// </summary>
        /// <param name="sentence">The sentence containing the word.</param>
        /// <param name="word">The word to check.</param>
        /// <param name="wordInSentence">
        /// Index of <paramref name="word"/> within the sentence's words.
        /// </param>
        /// <returns>
        /// <c>false</c> when the word directly follows the previous word and
        /// is a known ignorable suffix; <c>true</c> otherwise.
        /// </returns>
        private bool PassesSuffixCheck(
            InternalSentence sentence, InternalWord word, int wordInSentence)
        {
            // first, get the last word
            int lastWordIndex = wordInSentence - 1;

            if (lastWordIndex < 0)
            {
                // first word of the sentence
                return(true);
            }

            InternalWord lastWord = sentence.words[lastWordIndex];

            // check there are no spaces/characters between the last word and the
            // suffix
            if (lastWord.characterOffsetEnd != word.characterOffsetBegin)
            {
                return(true);
            }

            // is this an ignorable suffix?
            string lowerCaseWord = word.annotatedText.ToLower();

            if (_ignorableSuffixes.Contains(lowerCaseWord))
            {
                return(false); // we want to ignore it
            }

            // don't ignore it - add to unknown list only if it contains
            // a non-alphanumeric character, i.e. when it is NOT made up
            // entirely of letters and digits.  The pattern has to be anchored:
            // an unanchored [a-z0-9]+ matches as soon as one alphanumeric
            // character appears anywhere in the word, which would skip
            // suffixes such as "'d".
            Regex rgx = new Regex(@"^[a-z0-9]+$", RegexOptions.IgnoreCase);

            if (!rgx.IsMatch(lowerCaseWord))
            {
                _unknownSuffixes.Add(lowerCaseWord);
            }

            return(true);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Copies the current position counters and the text of
        /// <paramref name="currentWord"/> into <paramref name="word"/>.
        /// The calculator's own fields are only read, never written.
        /// </summary>
        /// <param name="word">The document word to populate.</param>
        /// <param name="currentWord">
        /// The internal word supplying the text and the begin offset.
        /// </param>
        internal void FillWord(ref DocumentWord word, InternalWord currentWord)
        {
            string text = currentWord.originalText;

            word.Text            = text;
            word.Page            = _page;
            word.Paragraph       = _paragraph;
            word.Sentence        = _sentence;
            word.IndexInSentence = _logicalWordInSentence;
            word.Line            = _line;

            // file offset = begin offset of the word + accumulated byte offset
            word.OffsetInFile = currentWord.characterOffsetBegin + _currentByteOffset;

            // |Calculate| has already added the current word's share to the
            // accumulated byte offset.  The offset reported here points at the
            // START of the word, so the bytes by which the word's UTF-8
            // encoding exceeds its character count are taken back out.  It is
            // done here, on the output only, so that |FillWord| leaves the
            // calculator's state untouched.
            int surplusBytes = Encoding.UTF8.GetByteCount(text) - text.Length;

            if (surplusBytes > 0)
            {
                word.OffsetInFile -= (uint)surplusBytes;
            }
        }