コード例 #1
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Parses <paramref name="phrase"/> into words, resolves each parsed word
        /// through <c>WordsService</c>, and hands the resulting phrase to
        /// <c>PhrasesDao.Insert</c>.
        /// </summary>
        /// <param name="phrase">Raw phrase text passed to <c>DocumentParser.FromPhrase</c>.</param>
        /// <returns>The phrase object after it has been passed to <c>PhrasesDao.Insert</c>.</returns>
        /// <exception cref="Exception">
        /// Fewer than <c>kMinimalWordsInPhrase</c> words were parsed from the phrase.
        /// </exception>
        public Phrase AddPhrase(string phrase)
        {
            DocumentParser parser        = DocumentParser.FromPhrase(phrase);
            List<Word>     resolvedWords = new List<Word>();

            // pull words out of the parser one by one and map each to its Word entity
            DocumentWord parsed;
            while (parser.GetNextWord(out parsed))
            {
                resolvedWords.Add(WordsService.Instance.GetWord(parsed.Text));
            }

            // refuse phrases that are too short before anything is stored
            if (resolvedWords.Count < kMinimalWordsInPhrase)
            {
                string message =
                    "Phrases must have, at least, " + kMinimalWordsInPhrase + " words!";
                throw new Exception(message);
            }

            Phrase created = new Phrase();
            created.Words = resolvedWords;

            // passed by ref, so the DAO may replace or update the instance we return
            PhrasesDao.Insert(ref created);
            return created;
        }
コード例 #2
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Checks that the bytes stored at the word's recorded file offset decode
        /// (as UTF-8) to exactly the word's text.
        /// </summary>
        /// <param name="word">
        /// Word to verify; its <c>OffsetInFile</c>, <c>Text</c> and <c>Line</c> are used.
        /// </param>
        /// <param name="fsValidation">
        /// Open, seekable stream to read from. It is closed by this method before
        /// the exception is thrown when the check fails.
        /// </param>
        /// <returns><c>true</c> when the text matches; a mismatch throws instead of returning.</returns>
        /// <exception cref="Exception">
        /// The text read from the stream differs from <c>word.Text</c>.
        /// </exception>
        private bool ValidateIntegrity(DocumentWord word, FileStream fsValidation)
        {
            fsValidation.Seek(word.OffsetInFile, SeekOrigin.Begin);

            int    byteCount = UTF8Encoding.UTF8.GetByteCount(word.Text);
            byte[] buffer    = new byte[byteCount];

            // Stream.Read may legally return fewer bytes than requested, so keep
            // reading until the buffer is full or the end of the file is hit.
            int totalRead = 0;
            while (totalRead < byteCount)
            {
                int read = fsValidation.Read(buffer, totalRead, byteCount - totalRead);
                if (read == 0)
                {
                    break; // end of file: whatever we have will fail the comparison
                }

                totalRead += read;
            }

            string validationWord = UTF8Encoding.UTF8.GetString(buffer, 0, totalRead);

            // Ordinal comparison: a culture-sensitive compare can treat different
            // character sequences as equal, which defeats an integrity check.
            if (!String.Equals(validationWord, word.Text, StringComparison.Ordinal))
            {
                string description = String.Format(
                    "Word: {0}, Line: {1}, Offset: {2}",
                    word.Text,
                    word.Line,
                    word.OffsetInFile);

                fsValidation.Close();
                throw new Exception(
                          "file parsing integrity validation failed:\n" + description);
            }

            return true;
        }
コード例 #3
0
        //-------------------------------------------------------------------------
        /// <summary>
        /// Builds a <c>Contains</c> object describing one occurrence of
        /// <paramref name="word"/> inside <paramref name="doc"/> and passes it to
        /// <c>ContainsDao.Insert</c>.
        /// </summary>
        /// <param name="doc">Document whose <c>Id</c> is stored as <c>DocumentId</c>.</param>
        /// <param name="word">Word whose <c>Id</c> is stored as <c>WordId</c>.</param>
        /// <param name="documentWord">
        /// Source of the positional fields: line, file offset, index in sentence,
        /// sentence, paragraph and page.
        /// </param>
        public void Insert(Document doc, Word word, DocumentWord documentWord)
        {
            Contains occurrence = new Contains();

            // which word, in which document
            occurrence.DocumentId = doc.Id;
            occurrence.WordId     = word.Id;

            // where in the document the word was found
            occurrence.Line            = documentWord.Line;
            occurrence.FileOffset      = documentWord.OffsetInFile;
            occurrence.IndexInSentence = documentWord.IndexInSentence;
            occurrence.Sentence        = documentWord.Sentence;
            occurrence.Paragraph       = documentWord.Paragraph;
            occurrence.Page            = documentWord.Page;

            ContainsDao.Insert(occurrence);
        }
コード例 #4
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Advances through the parsed document and returns the next word accepted
        /// by <c>_qualifiedWords</c> through <paramref name="word"/>.
        /// </summary>
        /// <param name="word">
        /// Always assigned a new <c>DocumentWord</c>; it is filled in only when the
        /// method returns <c>true</c>.
        /// </param>
        /// <returns>
        /// <c>true</c> when a qualified word was found; <c>false</c> when an end mark
        /// sentence was reached, or when the sentences ran out while
        /// <c>_fromFile</c> is <c>false</c>.
        /// </returns>
        /// <exception cref="Exception">
        /// No document has been parsed, or the sentences ran out while
        /// <c>_fromFile</c> is <c>true</c> without an end mark having been seen.
        /// </exception>
        public bool GetNextWord(out DocumentWord word)
        {
            // the out parameter must be assigned on every path, including the
            // ones that return false or throw
            word = new DocumentWord();

            if (_stanfordDocument == null)
            {
                throw new Exception("no document parsed");
            }

            // iterate sentences until we reach a qualified word, or end of the
            // document (valid end is when we find an end mark)
            // NOTE(review): NextSentenceCheck presumably moves _currentSentence
            // forward once the current sentence is exhausted - confirm
            while (NextSentenceCheck())
            {
                // did we reach the end of the document?
                if (CheckIfEndMark(_currentSentence))
                {
                    return(false);
                }

                // get next qualified word, resuming from the word index stored in
                // the current position
                int i = (int)_currentPosition.InternalWord;
                for (; i < _currentSentence.words.Count; ++i)
                {
                    InternalWord currentWord = _currentSentence.words[i];

                    // the position is updated for every word visited, whether or
                    // not the word turns out to be qualified
                    _currentPosition.Calculate(currentWord);

                    if (_qualifiedWords.Check(_currentSentence, currentWord, i))
                    {
                        _currentPosition.FillWord(ref word, currentWord);
                        _currentPosition.IncreaseWord(true); // next word index
                        return(true);
                    }

                    // unqualified word: step past it and keep scanning
                    _currentPosition.IncreaseWord(false);
                }
            }

            // if we are in a phrase, we just stop at the end of the phrase
            if (!_fromFile)
            {
                return(false);
            }

            // if we've reached here it means we've reached the end of the document
            // without a valid end mark... so we throw an exception
            throw new Exception("Unexpected end of document reached!");
        }
コード例 #5
0
        //--------------------------------------------------------------------------
        /// <summary>
        /// Inserts the document's words into the Words and Contains services,
        /// reporting progress through <c>OnLoadDocumentProcessing</c> after each word.
        /// </summary>
        /// <param name="file">File passed to the progress callback.</param>
        /// <param name="documentParser">Parser the words are pulled from.</param>
        /// <param name="doc">
        /// Document the words belong to; <c>doc.LocalFile</c> is opened for the
        /// optional integrity validation.
        /// </param>
        /// <exception cref="Exception">
        /// The progress callback requested cancellation while loading.
        /// </exception>
        private void InsertWords(
            FileInfo file, DocumentParser documentParser, Document doc)
        {
            bool         cancelRequest = false;
            DocumentWord documentWord  = null;

            // we will validate the integrity of our results by seeking each
            // word's position to the source file and asserting the word in the offset
            // of the file matches.
            // The using block guarantees the stream is released when validation
            // throws or the load is cancelled; the stream is only read, so it is
            // opened with read access.
            using (FileStream fsValidation = new FileStream(
                       doc.LocalFile.FullName, FileMode.Open, FileAccess.Read))
            {
                // check the cancel flag first so no further word is parsed once
                // the callback has asked us to stop
                while (!cancelRequest && documentParser.GetNextWord(out documentWord))
                {
                    //performing this validation will make loading documents really slow.
                    if (GlobalParamatersService.Configuration.PerformIntegrityValidations)
                    {
                        ValidateIntegrity(documentWord, fsValidation);
                    }

                    Word word = WordsService.Instance.GetWord(documentWord.Text);
                    ContainsService.Instance.Insert(doc, word, documentWord);

                    // progress is approximated by the sentence of the current word;
                    // guard the division so a zero sentence count cannot produce a
                    // NaN/infinite value that is then cast to int
                    float percent = 0;
                    if (documentParser.SentencesCount > 0)
                    {
                        percent =
                            (float)documentWord.Sentence / (float)documentParser.SentencesCount;
                        percent *= 100;
                    }

                    GlobalParamatersService.Delegate.OnLoadDocumentProcessing(
                        file, (int)percent, ref cancelRequest);
                }
            }

            if (cancelRequest)
            {
                throw new Exception("cancel was requested during document loading");
            }

            // just making sure we get to 100% as we may have to skip invalid words
            // and not reach 100%
            GlobalParamatersService.Delegate.OnLoadDocumentProcessing(
                file, 100, ref cancelRequest);
        }
コード例 #6
0
        /// <summary>
        /// Extracts the words of the document at <paramref name="FilePath"/> and adds
        /// one <c>DocumentWord</c> per extracted word to <c>AppContext.DocumentWords</c>.
        /// </summary>
        /// <param name="FilePath">Path of the document; split into folder and file name.</param>
        /// <param name="DocumentHash">Hash stored on every created <c>DocumentWord</c>.</param>
        public void ProcessWords(string FilePath, string DocumentHash)
        {
            // Each local holds what its name says: the folder comes from
            // GetFolderFromPath and the file name from GetFileNameFromPath.
            // NOTE(review): ExtractDocumentWords is called as (folder, file name);
            // confirm that matches its parameter order.
            string FolderPath = DocumentFile.GetFolderFromPath(FilePath);
            string FileName   = DocumentFile.GetFileNameFromPath(FilePath);

            // add words
            string[] Words = DocumentFile.ExtractDocumentWords(FolderPath, FileName);

            foreach (string Word in Words)
            {
                DocumentWord dw = new DocumentWord()
                {
                    DocumentHash = DocumentHash,
                    Word         = Word,
                    FilePath     = FilePath
                };

                AppContext.DocumentWords.Add(dw);
            }
        }
コード例 #7
0
        /// <summary>
        /// Copies the text of <paramref name="currentWord"/> and the calculator's
        /// current position fields into <paramref name="word"/>, including the
        /// word's offset in the file.
        /// </summary>
        /// <param name="word">Word object whose fields are overwritten.</param>
        /// <param name="currentWord">
        /// Parsed word supplying <c>originalText</c> and <c>characterOffsetBegin</c>.
        /// </param>
        internal void FillWord(ref DocumentWord word, InternalWord currentWord)
        {
            string text = currentWord.originalText;

            // text plus the position fields tracked by this calculator
            word.Text            = text;
            word.Page            = _page;
            word.Paragraph       = _paragraph;
            word.Sentence        = _sentence;
            word.IndexInSentence = _logicalWordInSentence;
            word.Line            = _line;

            // file offset = character offset of the word + accumulated byte offset
            word.OffsetInFile = currentWord.characterOffsetBegin + _currentByteOffset;

            // When the word itself contains multi-byte UTF-8 characters, its own
            // extra bytes are taken back out of the offset.  This keeps FillWord
            // free of changes to the calculator's internal state, unlike Calculate.
            int extraBytes = UTF8Encoding.UTF8.GetByteCount(text) - text.Length;
            if (extraBytes > 0)
            {
                word.OffsetInFile -= (uint)extraBytes;
            }
        }
コード例 #8
0
        /// <summary>
        /// Manual test driver for <c>DocumentParser</c>: parses a sample phrase,
        /// then parses a text file, saves a copy of it, checks the first byte of
        /// every word against that copy, and prints word statistics.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> (mandatory) - the txt file to parse;
        /// <c>args[1]</c> (optional) - storage location (folder) for the saved copy,
        /// defaulting to the folder of the txt file.
        /// </param>
        static void Main(string[] args)
        {
            // 1 - [, {, (, ...
            // 2 - 's, n't, 'll ...
            // 3 - class normalize high-ascii chars

            // test parsing a phrase
            DocumentParser phrase = DocumentParser.FromPhrase(
                "tolerance and who wrangles with everybody who does not" + "\n" +
                "do as he would like them to");

            DocumentWord aWord = null;

            // drain the parser; the words themselves are not used here
            while (phrase.GetNextWord(out aWord))
            {
            }

            // test parsing a file
            if (args.Length < 1)
            {
                Console.WriteLine("You need to pass a Gutenberg text file as a parameter");
                return;
            }

            FileInfo documentFile  = new FileInfo(args[0]);
            string   workDirectory = documentFile.DirectoryName;

            if (args.Length > 1)
            {
                workDirectory = args[1];
            }

            DocumentParser document = null;

            try {
                document = DocumentParser.FromFile(documentFile);
            } catch (Exception ex) {
                // concatenate: the (string, object) overload treats the first
                // argument as a format string, and without a {0} placeholder the
                // exception would never be printed
                Console.WriteLine("Failed to parse document: " + ex);
                return;
            }

            if (document == null)
            {
                Console.WriteLine("Failed to parse document");
                return;
            }

            Console.WriteLine("");
            Console.WriteLine("Document parsed successfully");
            Console.WriteLine("");
            Console.WriteLine("The meta-data:");

            foreach (var metaData in document.MetaData)
            {
                Console.WriteLine(
                    String.Format("{0}:\t{1}", metaData.Key, metaData.Value));
            }

            Dictionary <string, int> concordance = new Dictionary <string, int>();

            // save a copy into the storage folder and validate offsets against it
            string tempFilePath = Path.Combine(workDirectory, "temp.txt");
            document.Save(new FileInfo(tempFilePath), true);

            // using: the stream is released even if parsing throws part-way
            using (FileStream fs = new FileStream(tempFilePath, FileMode.Open, FileAccess.Read))
            {
                DocumentWord word = null;

                while (document.GetNextWord(out word))
                {
                    fs.Seek(word.OffsetInFile, SeekOrigin.Begin);

                    // compare only the first UTF-8 byte of the word with the byte
                    // found at the recorded offset
                    char firstChar = (char)System.Text.UTF8Encoding.UTF8.GetBytes(word.Text)[0];

                    char b = (char)fs.ReadByte();
                    if (b != firstChar)
                    {
                        Console.WriteLine("");
                        Console.WriteLine("Problematic word found - differnt from file:");
                        Console.WriteLine("text: " + word.Text);
                        Console.WriteLine("offset: " + word.OffsetInFile.ToString());
                        Console.WriteLine("page (100 lines per page): " + (word.Page + 1).ToString());
                        Console.WriteLine("paragraph: " + (word.Paragraph + 1).ToString());
                        Console.WriteLine("sentence: " + (word.Sentence + 1).ToString());
                        Console.WriteLine("word in sentence: " + (word.IndexInSentence + 1).ToString());
                        Console.WriteLine("line: " + (word.Line + 1).ToString());

                        Console.WriteLine("\r\nPress any key to continue...");
                        Console.ReadKey();
                    }

                    // culture-invariant lowering so the keys do not depend on the
                    // machine's current culture
                    string lowerCaseWord = word.Text.ToLowerInvariant();

                    // single lookup: occurrences stays 0 when the key is missing
                    int occurrences;
                    concordance.TryGetValue(lowerCaseWord, out occurrences);
                    concordance[lowerCaseWord] = occurrences + 1;
                }
            }

            Console.WriteLine("\r\nPress any key to continue...");
            Console.ReadKey();

            // sort it, most frequent first
            var list = concordance.ToList();

            list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));

            Console.WriteLine("");
            Console.WriteLine("Top 50 words:");
            // NOTE(review): despite the heading, this lists every word that occurs
            // more than 100 times, not the first 50 entries.
            for (int i = 0; i < list.Count; i++)
            {
                // the list is sorted descending, so nothing after this qualifies
                if (list[i].Value <= 100)
                {
                    break;
                }

                Console.WriteLine(
                    String.Format("{0}. {1} - {2}", i + 1, list[i].Key, list[i].Value));
            }

            Console.WriteLine("");
            Console.WriteLine("Unknown suffixes found in the document:");
            foreach (var suffix in document.UnknownSuffixes)
            {
                Console.WriteLine(suffix);
            }

            Console.WriteLine("\r\nPress any key to continue...");
            Console.ReadKey();
        }