/// <summary>
/// Checks whether a sentence is the Gutenberg "end mark".
/// Examples:
/// 1. *** END OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***
/// 2. ***END OF THE PROJECT GUTENBERG EBOOK THE DEFENCE OF LUCKNOW***
/// </summary>
/// <param name="sentence">The sentence to inspect.</param>
/// <returns>true if the sentence is an end mark, false otherwise.</returns>
private bool CheckIfEndMark(InternalSentence sentence)
{
    // at least "***" + "END" + "OF" and ends with "***"
    if (sentence.words.Count < 4)
    {
        return(false);
    }

    // first word should be ***
    if (!sentence.words[0].annotatedText.Equals("***"))
    {
        return(false);
    }

    // the next two words should be: END OF
    // NOTE: use an ordinal case-insensitive comparison instead of
    // ToLower().Equals - avoids culture-specific casing surprises
    // (e.g. the Turkish dotless 'I')
    if (!sentence.words[1].annotatedText.Equals(
        "end", StringComparison.OrdinalIgnoreCase))
    {
        return(false);
    }

    if (!sentence.words[2].annotatedText.Equals(
        "of", StringComparison.OrdinalIgnoreCase))
    {
        return(false);
    }

    // finally, the sentence should end with ***
    if (!sentence.words[sentence.words.Count - 1].annotatedText.Equals(
        "***"))
    {
        return(false);
    }

    return(true);
}
/// <summary>
/// Converts a Stanford token (CoreMap) into a models.InternalWord and
/// appends it to the given sentence's word list.
/// </summary>
/// <param name="stanfordWord">The Stanford token to convert.</param>
/// <param name="sentence">The sentence the new word is appended to.</param>
/// <returns>
/// true on success, false if either character offset cannot be parsed.
/// </returns>
private bool ExtractWord(
    CoreMap stanfordWord, ref InternalSentence sentence)
{
    uint begin;
    uint end;

    // both character offsets must parse as unsigned integers
    if (!UInt32.TryParse(
        stanfordWord.get(_charOffsetBeginAnnotationClass).ToString(),
        out begin))
    {
        return(false);
    }

    if (!UInt32.TryParse(
        stanfordWord.get(_charOffsetEndAnnotationClass).ToString(),
        out end))
    {
        return(false);
    }

    // the original (raw) text is sliced directly out of the raw data using
    // the character offsets, rather than taken from a Stanford annotation
    InternalWord extracted = new InternalWord()
    {
        annotatedText = stanfordWord.get(_textAnnotationClass).ToString(),
        originalText = _rawData.Substring((int)begin, (int)end - (int)begin),
        characterOffsetBegin = begin,
        characterOffsetEnd = end
    };
    sentence.words.Add(extracted);

    return(true);
}
//--------------------------------------------------------------------------
/// <summary>
/// Verifies that, after the start mark was located (while parsing the
/// meta data), the document also contains an end mark - and that at least
/// one sentence that is not the end mark appears before it.
/// </summary>
/// <returns>true if the document is valid, false otherwise.</returns>
private bool CheckDocumentValidityAfterMetaData()
{
    bool seenContent = false;
    InternalSentence sentence = new InternalSentence();

    for (uint index = _currentPosition.Sentence;
         index < _stanfordDocument.SentencesCount;
         ++index)
    {
        if (!_stanfordDocument.GetSentence(index, ref sentence))
        {
            return(false);
        }

        // the end mark terminates the scan; the document is valid only if
        // some real content preceded it
        if (CheckIfEndMark(sentence))
        {
            return(seenContent);
        }

        // any sentence that isn't the end mark counts as content
        seenContent = true;
    }

    // no end mark was found
    return(false);
}
/// <summary>
/// Prepares the parser for parsing a stand-alone phrase (as opposed to a
/// Gutenberg file) by loading the sentence at the current position.
/// </summary>
/// <returns>true if the current sentence was loaded successfully.</returns>
private bool InitForParsing()
{
    _currentSentence = new InternalSentence();

    // forward the result of loading the sentence at the current position
    return(_stanfordDocument.GetSentence(
        _currentPosition.Sentence, ref _currentSentence));
}
//-------------------------------------------------------------------------
/// <summary>
/// Parses a meta-data sentence into (key, value) pairs and stores them in
/// |_metaDataFields|.
///
/// Examples:
/// Title: Pride and Prejudice
///
/// Title: The Defence of Lucknow
/// A Diary Recording the Daily Events during the Siege of the...
///
/// Author: Jane Austen
/// Posting Date: August 26, 2008 [EBook #1342]
/// Release Date: June, 1998
/// Last updated: February 15, 2015]
/// [Last updated: December 20, 2011]
/// Language: English
///
///
/// Sometimes Stanford will return a sentence with multiple (Key: Value)
/// pairs - so we will not use Stanford word parsing here
/// </summary>
/// <param name="sentence">The sentence to extract meta data from.</param>
public void Parse(InternalSentence sentence)
{
    // first, we split the sentence into lines - this will help tackling both
    // multi-line values (like the Title: The Defence of...)
    // and multi-field sentences - like the dates example above
    //
    // we handle both Windows ("\r\n") and Unix ("\n") style newlines
    // explicitly - splitting on Environment.NewLine alone would miss "\n"
    // line breaks when running on Windows
    var splitData = sentence.text.Split(
        new string[] { "\r\n", "\n" }, StringSplitOptions.None);

    int index = 0;
    while (index < splitData.Length)
    {
        string currentItem = splitData[index];

        // try to get Gutenberg id: [EBook #id]
        TryExtractingGutenbergId(currentItem);

        TrimBrackets(ref currentItem);

        // extract (key, value) pair
        string key = "", value = "";
        if (!ExtractKeyValue(currentItem, ref key, ref value))
        {
            index++;
            continue;
        }

        if (_whitelistedKeys.ContainsKey(key.ToLower()))
        {
            int linesAdded;
            AppendNextLinesToValue(splitData, index, ref value, out linesAdded);
            index += linesAdded;
        }
        else
        {
            // if it isn't a whitelisted key - we need to clean up values
            // according to what we know we don't need (by reviewing documents)
            CleanUpValue(ref value);
        }

        index++;

        // add to member variable - use the indexer (not Add) so a repeated
        // key (e.g. "Last updated" appearing twice, see the examples above)
        // overwrites the previous value instead of throwing ArgumentException
        _metaDataFields[key.ToLower()] = value;
    }
}
/// <summary>
/// Retrieves the sentence at |index|, including its words.
/// </summary>
/// <param name="index">Zero-based sentence index.</param>
/// <param name="sentence">Receives the extracted sentence.</param>
/// <returns>
/// true on success; false if the sentence or any of its words could not be
/// extracted.
/// </returns>
/// <exception cref="IndexOutOfRangeException">
/// Thrown when |index| is out of range.
/// </exception>
public bool GetSentence(uint index, ref InternalSentence sentence)
{
    // NOTE: |index| is unsigned, so it can never be negative; the previous
    // check "(index < 0) || (index > SentencesCount - 1)" was broken when
    // SentencesCount == 0, because the unsigned "SentencesCount - 1"
    // underflows to UInt32.MaxValue and the range check never fires
    if (index >= this.SentencesCount)
    {
        throw new IndexOutOfRangeException();
    }

    // extract the basic sentence fields
    CoreMap stanfordSentence = (CoreMap)_sentences.get((int)index);
    if (!ExtractSentence(stanfordSentence, ref sentence))
    {
        return(false);
    }

    // extract words
    java.util.ArrayList stanfordWords =
        (java.util.ArrayList)stanfordSentence.get(_tokensAnnotationClass);
    if (stanfordWords.size() <= 0)
    {
        // a sentence with no tokens is still a valid sentence
        return(true);
    }

    // a "minor" optimisation (capacity)
    sentence.words = new System.Collections.Generic.List<InternalWord>(
        stanfordWords.size());

    foreach (CoreMap stanfordWord in stanfordWords)
    {
        if (!ExtractWord(stanfordWord, ref sentence))
        {
            // NOTE: currently, we fail the entire sentence even if a single
            // word fails
            return(false);
        }
    }

    return(true);
}
/// <summary>
/// Runs every word-qualification check; a word qualifies only when all of
/// the individual checks pass.
/// </summary>
/// <param name="sentence">The sentence containing the word.</param>
/// <param name="word">The word to check.</param>
/// <param name="wordInSentence">The word's index within the sentence.</param>
/// <returns>true if the word passes all checks.</returns>
public bool Check(
    InternalSentence sentence, InternalWord word, int wordInSentence)
{
    // short-circuit: the checks run in the same order as before, and the
    // first failure stops the evaluation
    return(PassesOneCharAlphaNumericCheck(word) &&
           PassesSuffixCheck(sentence, word, wordInSentence) &&
           PassesBlacklistRepeatedCharsCheck(word));
}
/// <summary>
/// Copies the basic fields (text and file offsets) of a Stanford sentence
/// into a models sentence.
/// </summary>
/// <param name="stanfordSentence">The Stanford sentence to read.</param>
/// <param name="sentence">The models sentence to fill in.</param>
/// <returns>true on success, false if either offset fails to parse.</returns>
private bool ExtractSentence(
    CoreMap stanfordSentence, ref InternalSentence sentence)
{
    sentence.text = stanfordSentence.get(_textAnnotationClass).ToString();

    // the begin offset must parse before we even look at the end offset
    if (!UInt32.TryParse(
        stanfordSentence.get(_charOffsetBeginAnnotationClass).ToString(),
        out sentence.fileOffsetBegin))
    {
        return(false);
    }

    // the extraction succeeds exactly when the end offset parses as well
    return(UInt32.TryParse(
        stanfordSentence.get(_charOffsetEndAnnotationClass).ToString(),
        out sentence.fileOffsetEnd));
}
/// <summary>
/// Check if this word is only a suffix for a qualified word.
/// We ignore them - for example:
/// Tom's house...
/// Tom couldn't (n't)
/// You're
///
/// NOTE: we might want to make a specific suffix part of the last
/// word (Could + n't = Couldn't) - in that case, we need to change
/// the |Check| function altogether
/// </summary>
/// <param name="sentence">The sentence containing the word.</param>
/// <param name="word">The word to check.</param>
/// <param name="wordInSentence">The word's index within the sentence.</param>
/// <returns>
/// false when the word is an ignorable suffix, true otherwise (i.e. true
/// means the word passes this check).
/// </returns>
private bool PassesSuffixCheck(
    InternalSentence sentence, InternalWord word, int wordInSentence)
{
    // first, get the last word
    int lastWordIndex = wordInSentence - 1;
    if (lastWordIndex < 0)
    {
        // first word of the sentence - it cannot be a suffix of anything
        return(true);
    }
    InternalWord lastWord = sentence.words[lastWordIndex];

    // check there are no spaces/characters between the last word and the
    // suffix - a suffix must be directly adjacent to the previous word
    if (lastWord.characterOffsetEnd != word.characterOffsetBegin)
    {
        return(true);
    }

    // is this an ignorable suffix?
    string lowerCaseWord = word.annotatedText.ToLower();
    if (_ignorableSuffixes.Contains(lowerCaseWord))
    {
        return(false); // we want to ignore it
    }

    // don't ignore it - add to unknown list only if it contains
    // a non-alphanumeric character
    //
    // NOTE(review): the code below does not match the comment above - with
    // pattern "[a-z0-9]+", !IsMatch adds the word only when it contains NO
    // alphanumeric characters at all (pure punctuation). To match the
    // comment's wording the pattern would be "[^a-z0-9]". Confirm which
    // behavior is intended before changing.
    //
    // NOTE(review): the Regex is allocated on every call; consider hoisting
    // it to a static readonly field.
    Regex rgx = new Regex(@"[a-z0-9]+", RegexOptions.IgnoreCase);
    if (!rgx.IsMatch(lowerCaseWord))
    {
        _unknownSuffixes.Add(lowerCaseWord);
    }

    return(true);
}
/// <summary>
/// Parses document meta-data such as: Author, Title, Language...
///
/// The function iterates all lines until reaching the: *** START OF ...
/// - which is the "start mark"
/// </summary>
/// <returns>
/// true if the start mark was found and the sentence after it was loaded
/// into |_currentSentence|; false on a failed sentence read or if no start
/// mark exists.
/// </returns>
private bool ParseMetaData()
{
    _currentSentence = new InternalSentence();

    while (_currentPosition.Sentence < _stanfordDocument.SentencesCount)
    {
        if (!_stanfordDocument.GetSentence(
            _currentPosition.Sentence, ref _currentSentence))
        {
            return(false);
        }

        // even if we have reached the content start mark - update positions
        // so that when we parse words we keep track of the current positions
        foreach (var internalWord in _currentSentence.words)
        {
            _currentPosition.Calculate(internalWord);
        }

        // make sure we move to the next sentence in our counter so that we
        // read the next sentence on the next GetNextWord call
        // NOTE: this must happen BEFORE the start-mark check below, since the
        // check reads the sentence AFTER the start mark on success
        _currentPosition.IncreaseSentence();

        // if we've reached the content start mark - stop parsing meta data
        if (CheckIfStartMark(_currentSentence))
        {
            _currentPosition.ResetWordInSentence(); // to be sure this happens

            // found the start mark - read next sentence to _currentSentence
            return(_stanfordDocument.GetSentence(
                _currentPosition.Sentence, ref _currentSentence));
        }

        // try extracting meta data from the sentence
        _metaDataParser.Parse(_currentSentence);
    }

    // exhausted the document without finding the start mark
    return(false);
}