//--------------------------------------------------------------------------
/// <summary>
/// Parses the given phrase text, resolves every parsed word through the
/// words service, and persists the resulting phrase.
/// </summary>
/// <param name="phrase">Free text of the phrase to add.</param>
/// <returns>The persisted phrase with its word list populated.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the phrase contains fewer than kMinimalWordsInPhrase words.
/// (ArgumentException derives from Exception, so existing catch blocks
/// keep working.)
/// </exception>
public Phrase AddPhrase(string phrase)
{
    DocumentParser parsedPhrase = DocumentParser.FromPhrase(phrase);
    DocumentWord documentWord = null;
    List<Word> words = new List<Word>();
    while (parsedPhrase.GetNextWord(out documentWord))
    {
        // GetWord resolves (or creates) the canonical Word for this text.
        Word word = WordsService.Instance.GetWord(documentWord.Text);
        words.Add(word);
    }

    if (words.Count < kMinimalWordsInPhrase)
    {
        throw new ArgumentException(String.Format(
            "Phrases must have, at least, {0} words!", kMinimalWordsInPhrase));
    }

    Phrase resultPhrase = new Phrase() { Words = words };
    // Insert takes the phrase by ref so the DAO can fill generated fields.
    PhrasesDao.Insert(ref resultPhrase);
    return resultPhrase;
}
//--------------------------------------------------------------------------
/// <summary>
/// Validates that the word's text actually appears in the source file at
/// the byte offset the parser recorded for it.
/// </summary>
/// <param name="word">Parsed word carrying the expected text and offset.</param>
/// <param name="fsValidation">Open, seekable stream over the source file.</param>
/// <returns>true when the bytes at the offset match the word's text.</returns>
/// <exception cref="Exception">
/// Thrown (after closing the stream) when the file content does not match.
/// </exception>
private bool ValidateIntegrity(DocumentWord word, FileStream fsValidation)
{
    fsValidation.Seek(word.OffsetInFile, SeekOrigin.Begin);
    int byteCount = UTF8Encoding.UTF8.GetByteCount(word.Text);
    byte[] buffer = new byte[byteCount];

    // BUG FIX: Stream.Read may return fewer bytes than requested; the
    // original ignored the return value, so a partial read compared the
    // word against a half-filled buffer. Loop until the full word is in.
    int totalRead = 0;
    while (totalRead < byteCount)
    {
        int read = fsValidation.Read(buffer, totalRead, byteCount - totalRead);
        if (read == 0)
        {
            break;  // unexpected end of file; comparison below will fail
        }
        totalRead += read;
    }

    string validationWord = UTF8Encoding.UTF8.GetString(buffer, 0, totalRead);
    if (validationWord.CompareTo(word.Text) != 0)
    {
        string description = String.Format(
            "Word: {0}, Line: {1}, Offset: {2}",
            word.Text, word.Line, word.OffsetInFile);
        fsValidation.Close();
        throw new Exception(
            "file parsing integrity validation failed:\n" + description);
    }
    return true;
}
//-------------------------------------------------------------------------
/// <summary>
/// Records that the given document contains the given word at the position
/// described by documentWord, persisting a new Contains row.
/// </summary>
/// <param name="doc">Document the word occurrence belongs to.</param>
/// <param name="word">Canonical word entity for the occurrence.</param>
/// <param name="documentWord">Positional data from the document parser.</param>
public void Insert(Document doc, Word word, DocumentWord documentWord)
{
    Contains row = new Contains();
    row.DocumentId = doc.Id;
    row.WordId = word.Id;
    row.Line = documentWord.Line;
    row.FileOffset = documentWord.OffsetInFile;
    row.IndexInSentence = documentWord.IndexInSentence;
    row.Sentence = documentWord.Sentence;
    row.Paragraph = documentWord.Paragraph;
    row.Page = documentWord.Page;
    ContainsDao.Insert(row);
}
//--------------------------------------------------------------------------
/// <summary>
/// Advances to the next "qualified" word of the parsed document and fills
/// |word| with its text and position. Iteration state is kept in
/// _currentPosition between calls.
/// </summary>
/// <param name="word">
/// Receives the next word's data; always assigned a fresh DocumentWord,
/// even when the method returns false.
/// </param>
/// <returns>
/// true when a qualified word was produced; false at the end of input
/// (end-mark for file-based parsing, end of text for phrase parsing).
/// </returns>
/// <exception cref="Exception">
/// Thrown when no document was parsed, or when a file-based document ends
/// without a valid end mark.
/// </exception>
public bool GetNextWord(out DocumentWord word)
{
    word = new DocumentWord();
    if (_stanfordDocument == null)
    {
        throw new Exception("no document parsed");
    }

    // iterate sentences until we reach a qualified word, or end of the
    // document (valid end is when we find an end mark)
    while (NextSentenceCheck())
    {
        // did we reach the end of the document?
        if (CheckIfEndMark(_currentSentence))
        {
            return(false);
        }

        // resume from the word index saved in _currentPosition on the
        // previous call, so each call continues where the last stopped
        int i = (int)_currentPosition.InternalWord;
        for (; i < _currentSentence.words.Count; ++i)
        {
            InternalWord currentWord = _currentSentence.words[i];
            // Calculate mutates _currentPosition's internal counters
            // (line/offset tracking) and must run for EVERY word,
            // qualified or not, to keep positions accurate.
            _currentPosition.Calculate(currentWord);
            if (_qualifiedWords.Check(_currentSentence, currentWord, i))
            {
                _currentPosition.FillWord(ref word, currentWord);
                _currentPosition.IncreaseWord(true);  // next word index
                return(true);
            }
            // word skipped (not qualified); still advance the position
            _currentPosition.IncreaseWord(false);
        }
    }

    // if we are in a phrase, we just stop at the end of the phrase
    if (!_fromFile)
    {
        return(false);
    }

    // if we've reached here it means we've reached the end of the document
    // without a valid end mark... so we throw an exception
    throw new Exception("Unexpected end of document reached!");
}
//--------------------------------------------------------------------------
/// <summary>
/// Inserts the document's words into the Words and Contains services,
/// reporting progress through the global delegate and optionally
/// validating each word's recorded file offset against the source file.
/// </summary>
/// <param name="file">Source file, used for progress reporting.</param>
/// <param name="documentParser">Parser positioned at the document start.</param>
/// <param name="doc">Persisted document the words belong to.</param>
/// <exception cref="Exception">
/// Thrown when the delegate requests cancellation during loading.
/// </exception>
private void InsertWords(
    FileInfo file, DocumentParser documentParser, Document doc)
{
    bool cancelRequest = false;
    DocumentWord documentWord = null;

    // we will validate the integrity of our results by seeking each
    // word's position in the source file and asserting the word at that
    // offset matches. BUG FIX: 'using' guarantees the stream is closed
    // even when ValidateIntegrity throws or a cancel request aborts the
    // loop — the original leaked the handle on both paths.
    using (FileStream fsValidation =
        new FileStream(doc.LocalFile.FullName, FileMode.Open))
    {
        while (documentParser.GetNextWord(out documentWord) && !cancelRequest)
        {
            // performing this validation will make loading documents really slow
            if (GlobalParamatersService.Configuration.PerformIntegrityValidations)
            {
                ValidateIntegrity(documentWord, fsValidation);
            }

            Word word = WordsService.Instance.GetWord(documentWord.Text);
            ContainsService.Instance.Insert(doc, word, documentWord);

            // progress is approximated by sentence index over sentence count
            float percent = (float)documentWord.Sentence /
                (float)documentParser.SentencesCount;
            percent *= 100;
            GlobalParamatersService.Delegate.OnLoadDocumentProcessing(
                file, (int)percent, ref cancelRequest);
        }
    }

    if (cancelRequest)
    {
        throw new Exception("cancel was requested during document loading");
    }

    // just making sure we get to 100% as we may have to skip invalid words
    // and not reach 100%
    GlobalParamatersService.Delegate.OnLoadDocumentProcessing(
        file, 100, ref cancelRequest);
}
/// <summary>
/// Extracts every word of the document at FilePath and stages a
/// DocumentWord row for each one in the data context.
/// </summary>
/// <param name="FilePath">Full path of the document file.</param>
/// <param name="DocumentHash">Hash identifying the document.</param>
public void ProcessWords(string FilePath, string DocumentHash)
{
    // BUG FIX: the original assigned FolderPath from GetFileNameFromPath
    // and FileName from GetFolderFromPath — the two helpers were swapped.
    string FolderPath = DocumentFile.GetFolderFromPath(FilePath);
    string FileName = DocumentFile.GetFileNameFromPath(FilePath);

    // add words
    string[] Words = DocumentFile.ExtractDocumentWords(FolderPath, FileName);
    foreach (string Word in Words)
    {
        DocumentWord dw = new DocumentWord()
        {
            DocumentHash = DocumentHash,
            Word = Word,
            FilePath = FilePath
        };
        AppContext.DocumentWords.Add(dw);
    }
}
/// <summary>
/// Copies the current position state (page/paragraph/sentence/line/word
/// counters) plus the given word's text and file offset into |word|.
/// Read-only with respect to this object: unlike |Calculate|, it must not
/// change CurrentPositionCalculator's internal state.
/// </summary>
/// <param name="word">Destination record to fill.</param>
/// <param name="currentWord">Parsed word supplying text and char offset.</param>
internal void FillWord(ref DocumentWord word, InternalWord currentWord)
{
    word.Text = currentWord.originalText;
    word.Page = _page;
    word.Paragraph = _paragraph;
    word.Sentence = _sentence;
    word.IndexInSentence = _logicalWordInSentence;
    word.Line = _line;

    // calculate file offset: the parser's character offset plus the byte
    // correction accumulated so far for multi-byte UTF-8 characters
    word.OffsetInFile = currentWord.characterOffsetBegin + _currentByteOffset;

    // if the offset is due to the current word - remove the offset part of
    // the current word. We do this because we want to keep |FillWord| as
    // something that doesn't change the internal state of
    // CurrentPositionCalculator - unlike the |Calculate| method.
    // NOTE(review): this presumes |Calculate| already folded THIS word's
    // multi-byte surplus (UTF-8 bytes minus char count) into
    // _currentByteOffset, so it is subtracted back out here to point at
    // the word's first byte — TODO confirm against |Calculate|.
    uint count = (uint)UTF8Encoding.UTF8.GetByteCount(
        currentWord.originalText);
    if (count > currentWord.originalText.Length)
    {
        word.OffsetInFile -= (uint)(count - currentWord.originalText.Length);
    }
}
// expects 1 mandatory command line argument and 1 optional
// mandatory - txt file
// optional - storage location (folder)
//
// Console test harness: parses a phrase, then parses the given Gutenberg
// file, round-trips it through a saved copy to verify word offsets, and
// prints a concordance plus unknown suffixes.
static void Main(string[] args)
{
    // test parsing a phrase
    DocumentParser phrase = DocumentParser.FromPhrase(
        "tolerance and who wrangles with everybody who does not" + "\n" +
        "do as he would like them to");
    DocumentWord aWord = null;
    while (phrase.GetNextWord(out aWord))
    {
    }

    // test parsing a file
    if (args.Length < 1)
    {
        Console.WriteLine("You need to pass a Gutenberg text file as a parameter");
        return;
    }

    FileInfo documentFile = new FileInfo(args[0]);
    string workDirectory = documentFile.DirectoryName;
    if (args.Length > 1)
    {
        workDirectory = args[1];
    }

    DocumentParser document = null;
    try
    {
        document = DocumentParser.FromFile(documentFile);
    }
    catch (Exception ex)
    {
        // BUG FIX: the original format string had no {0} placeholder, so
        // the exception argument was silently dropped from the output.
        Console.WriteLine("Failed to parse document: {0}", ex);
        return;
    }

    if (document == null)
    {
        Console.WriteLine("Failed to parse document");
        return;
    }

    Console.WriteLine("");
    Console.WriteLine("Document parsed successfully");
    Console.WriteLine("");
    Console.WriteLine("The meta-data:");
    foreach (var metaData in document.MetaData)
    {
        Console.WriteLine(
            String.Format("{0}:\t{1}", metaData.Key, metaData.Value));
    }

    Dictionary<string, int> concordance = new Dictionary<string, int>();
    int wordCounter = 0;

    // BUG FIX: the round-trip copy was written to a hard-coded local path
    // while workDirectory (the documented optional storage location) was
    // computed but never used; save the copy there instead.
    string tempFile = Path.Combine(workDirectory, "temp.txt");
    document.Save(new FileInfo(tempFile), true);

    // BUG FIX: 'using' guarantees the validation stream is closed; the
    // original never closed it.
    using (FileStream fs = new FileStream(tempFile, FileMode.Open))
    {
        DocumentWord word = null;
        while (document.GetNextWord(out word))
        {
            wordCounter++;

            // compare the byte at the recorded offset with the first UTF-8
            // byte of the word to spot offset-calculation errors
            fs.Seek(word.OffsetInFile, SeekOrigin.Begin);
            char firstChar =
                (char)System.Text.UTF8Encoding.UTF8.GetBytes(word.Text)[0];
            char b = (char)fs.ReadByte();
            if (b != firstChar)
            {
                Console.WriteLine("");
                Console.WriteLine("Problematic word found - different from file:");
                Console.WriteLine("text: " + word.Text);
                Console.WriteLine("offset: " + word.OffsetInFile.ToString());
                Console.WriteLine("page (100 lines per page): " + (word.Page + 1).ToString());
                Console.WriteLine("paragraph: " + (word.Paragraph + 1).ToString());
                Console.WriteLine("sentence: " + (word.Sentence + 1).ToString());
                Console.WriteLine("word in sentence: " + (word.IndexInSentence + 1).ToString());
                Console.WriteLine("line: " + (word.Line + 1).ToString());
                Console.WriteLine("\r\nPress any key to continue...");
                Console.ReadKey();
            }

            // concordance is case-insensitive
            string lowerCaseWord = word.Text.ToLower();
            if (!concordance.ContainsKey(lowerCaseWord))
            {
                concordance.Add(lowerCaseWord, 1);
            }
            else
            {
                concordance[lowerCaseWord]++;
            }
        }
    }

    Console.WriteLine("\r\nPress any key to continue...");
    Console.ReadKey();

    // sort by descending frequency
    var list = concordance.ToList();
    list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));

    Console.WriteLine("");
    Console.WriteLine("Top 50 words:");
    for (int i = 0; i < list.Count; i++)
    {
        if (list[i].Value > 100)
        {
            Console.WriteLine(
                String.Format("{0}. {1} - {2}", i + 1, list[i].Key, list[i].Value));
        }
    }

    Console.WriteLine("");
    Console.WriteLine("Unknown suffixes found in the document:");
    foreach (var suffix in document.UnknownSuffixes)
    {
        Console.WriteLine(suffix);
    }

    Console.WriteLine("\r\nPress any key to continue...");
    Console.ReadKey();
}