Example #1
0
 /// <summary>
 /// Creates a new <see cref="MarcellDocument"/> that carries the document-level
 /// metadata (dates, type, issuer, language, url, id) of the given source document.
 /// </summary>
 /// <param name="sourceDocument">Source document whose <c>doc</c> metadata is copied.</param>
 /// <returns>
 /// A document with a freshly generated <c>InternalId</c> and <c>IsStructured</c> set to false.
 /// </returns>
 private MarcellDocument GetBasicDocument(CoNLLDocument sourceDocument)
 {
     // All copied values come from the same metadata node, so read it once.
     var header = sourceDocument.doc;

     var basicDocument = new MarcellDocument
     {
         ApprovalDate  = header.date_approved,
         DocumentDate  = header.date,
         DocumentType  = header.entype,
         EffectiveDate = header.date_effect,
         InternalId    = Guid.NewGuid(),
         IsStructured  = false,
         Id            = header.id,
         Issuer        = header.issuer,
         Language      = header.language,
         OriginalType  = header.type,
         Url           = header.url
     };

     return basicDocument;
 }
Example #2
0
        /// <summary>
        /// First parsing pass: converts the source document into a <see cref="MarcellDocument"/>
        /// built from sections, paragraphs, sentences and tokens, and propagates the consolidated
        /// similarity tokens and topics upwards from tokens to sentences, paragraphs and sections.
        /// </summary>
        /// <param name="sourceDocument">The source document to convert.</param>
        /// <returns>
        /// The populated document. <c>IsStructured</c> is set to true when the document ends up
        /// with more than one section.
        /// </returns>
        /// <remarks>
        /// Null sentence entries are skipped. A paragraph without sentences and a document without
        /// sections receive a recognition quality of 1, mirroring the value used for a sentence
        /// that has no countable tokens.
        /// </remarks>
        public MarcellDocument ParsePass1(CoNLLDocument sourceDocument)
        {
            //Initialize the document with basic information, new Guid as the internal Id and as an unstructured document
            MarcellDocument document = GetBasicDocument(sourceDocument);

            Section currentSection = new Section()
            {
                InternalId = Guid.NewGuid(),
                Language   = document.Language
            };
            Section nextSection     = null;
            //True while currentSection holds nothing that still has to be added to the document
            bool    sectionCommited = true;
            int     curPar          = 0;

            //Go through the document and combine all sentences into paragraphs, propagating keywords upwards from children to parents
            foreach (var paragraph in sourceDocument.doc.p)
            {
                Paragraph currentParagraph = new Paragraph()
                {
                    Id              = paragraph.id,
                    InternalId      = Guid.NewGuid(),
                    Language        = document.Language,
                    ParagraphNumber = curPar.ToString(),
                    Order           = curPar
                };
                curPar++;

                StringBuilder paragraphText = new StringBuilder();
                int           curSent       = 0;
                foreach (var sentence in paragraph.s)
                {
                    //A missing sentence entry carries no data; skip it before any member is dereferenced
                    if (sentence == null)
                    {
                        continue;
                    }

                    //A sentence that opens a new section closes the current one (if it has pending content)
                    if (!string.IsNullOrEmpty(sentence.text) && IsNewSection(sentence.text, currentSection, out nextSection))
                    {
                        if (!sectionCommited)
                        {
                            AddSectionToDocument(document, currentSection);
                            sectionCommited = true;
                        }
                        currentSection = nextSection;
                    }

                    Sentence currentSentence = new Sentence
                    {
                        Id         = sentence.id,
                        InternalId = Guid.NewGuid(),
                        Language   = document.Language,
                        Text       = sentence.text,
                        Order      = curSent,
                    };
                    curSent++;
                    int curPos           = 0;
                    int recognizedTokens = 0;
                    int totalTokens      = 0;
                    foreach (var token in sentence.token)
                    {
                        Token currentToken = GetParsedToken(token, document, curPos);

                        currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.EuroVocEntities);
                        currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.IateEntities);
                        currentToken.SimilarityData.ConsolidatedTopics.AddRange(currentToken.IateDomains);

                        //Every non-punctuation token counts twice: once for EuroVoc and once for IATE recognition
                        if (currentToken.GeneralPos != "PUNCT")
                        {
                            if (currentToken.EuroVocEntities.Count > 0)
                            {
                                recognizedTokens++;
                            }
                            if (currentToken.IateEntities.Count > 0)
                            {
                                recognizedTokens++;
                            }
                            totalTokens += 2;
                        }

                        currentSentence.SentenceSimilarityData.ConsolidatedTokens.AddRange(currentToken.SimilarityData.ConsolidatedTokens);
                        currentSentence.SentenceSimilarityData.ConsolidatedTopics.AddRange(currentToken.SimilarityData.ConsolidatedTopics);

                        currentSentence.Tokens.Add(currentToken);
                        currentSentence.TokenCount++;

                        curPos++;
                    }

                    currentSentence.SentenceSimilarityData.ConsolidatedTokens = currentSentence.SentenceSimilarityData.ConsolidatedTokens.Distinct().ToList();
                    currentSentence.SentenceSimilarityData.ConsolidatedTopics = currentSentence.SentenceSimilarityData.ConsolidatedTopics.Distinct().ToList();

                    //A sentence without countable tokens has nothing unrecognized, so it is rated 1
                    if (totalTokens > 0)
                    {
                        currentSentence.RecognitionQuality = (double)recognizedTokens / (double)totalTokens;
                    }
                    else
                    {
                        currentSentence.RecognitionQuality = 1;
                    }

                    paragraphText.Append(currentSentence.Text).Append(" ");
                    currentParagraph.Sentences.Add(currentSentence);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTokens);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTopics);
                    currentParagraph.TokenCount += currentSentence.TokenCount;
                }
                currentParagraph.Text = paragraphText.ToString();
                currentSection.TextStringBuilder.AppendLine(currentParagraph.Text);
                currentSection.TokenCount          += currentParagraph.TokenCount;

                //Average() throws on an empty sequence, so a paragraph without sentences gets the same
                //rating as a sentence without countable tokens
                if (currentParagraph.Sentences.Any())
                {
                    currentParagraph.RecognitionQuality = currentParagraph.Sentences.Average(s => s.RecognitionQuality);
                }
                else
                {
                    currentParagraph.RecognitionQuality = 1;
                }

                currentParagraph.ParagraphSimilarityData.ConsolidatedTokens = currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.Distinct().ToList();
                currentParagraph.ParagraphSimilarityData.ConsolidatedTopics = currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.Distinct().ToList();

                currentSection.SectionSimilarityData.ConsolidatedTokens.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTokens);
                currentSection.SectionSimilarityData.ConsolidatedTopics.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTopics);

                currentSection.Paragraphs.Add(currentParagraph);
                sectionCommited = false;
            }

            //If the last section has been modified after last commit, add it to the document
            if (!sectionCommited)
            {
                AddSectionToDocument(document, currentSection);
            }

            //Average() throws on an empty sequence; a source without paragraphs yields no sections
            if (document.Sections.Count > 0)
            {
                document.RecognitionQuality = document.Sections.Average(s => s.RecognitionQuality);
            }
            else
            {
                document.RecognitionQuality = 1;
            }

            if (document.Sections.Count > 1)
            {
                document.IsStructured = true;
            }
            return(document);
        }