/// <summary>
/// Creates the document shell for a parsed source: copies the header metadata found on
/// <c>sourceDocument.doc</c>, assigns a freshly generated internal id and marks the
/// result as unstructured.
/// </summary>
/// <param name="sourceDocument">Source document whose <c>doc</c> header is copied.</param>
/// <returns>A new <see cref="MarcellDocument"/> carrying only header-level metadata.</returns>
private MarcellDocument GetBasicDocument(CoNLLDocument sourceDocument)
{
    var header = sourceDocument.doc;

    MarcellDocument basicDocument = new MarcellDocument
    {
        ApprovalDate = header.date_approved,
        DocumentDate = header.date,
        DocumentType = header.entype,
        EffectiveDate = header.date_effect,
        // The internal id is always newly generated; the source id is kept separately in Id.
        InternalId = Guid.NewGuid(),
        IsStructured = false,
        Id = header.id,
        Issuer = header.issuer,
        Language = header.language,
        OriginalType = header.type,
        Url = header.url
    };

    return basicDocument;
}
/// <summary>
/// First parsing pass: walks every paragraph, sentence and token of the source document and
/// builds the corresponding <see cref="MarcellDocument"/> hierarchy.
/// </summary>
/// <remarks>
/// <para>
/// Consolidated tokens and topics are copied upward from tokens to sentences, from sentences to
/// paragraphs and from paragraphs to the current section; the sentence and paragraph lists are
/// de-duplicated here.
/// </para>
/// <para>
/// Recognition quality of a sentence is <c>recognized / total</c>, where every non-punctuation
/// token contributes two slots (one for EuroVoc, one for IATE) and a slot counts as recognized
/// when the token has at least one entity of that kind. A sentence without countable tokens, a
/// paragraph without sentences and a document without sections all get a quality of 1.
/// </para>
/// <para>
/// Null entries in a paragraph's sentence list are skipped.
/// </para>
/// </remarks>
/// <param name="sourceDocument">The CoNLL document to convert.</param>
/// <returns>
/// The populated document; <c>IsStructured</c> is set to <c>true</c> when more than one section
/// ended up in <c>document.Sections</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="sourceDocument"/> is null.</exception>
public MarcellDocument ParsePass1(CoNLLDocument sourceDocument)
{
    if (sourceDocument == null)
    {
        throw new ArgumentNullException("sourceDocument");
    }

    // Initialize the document with basic information, a new Guid as the internal Id,
    // and flagged as an unstructured document.
    MarcellDocument document = GetBasicDocument(sourceDocument);

    Section currentSection = new Section()
    {
        InternalId = Guid.NewGuid(),
        Language = document.Language
    };
    Section nextSection = null;

    // True while currentSection holds no paragraph that has not yet been handed to
    // AddSectionToDocument; starts true so an empty leading section is never committed.
    bool sectionCommitted = true;
    int curPar = 0;

    // Go through the document and combine all sentences into paragraphs, propagating
    // consolidated tokens/topics upwards from children to parents.
    foreach (var paragraph in sourceDocument.doc.p)
    {
        Paragraph currentParagraph = new Paragraph()
        {
            Id = paragraph.id,
            InternalId = Guid.NewGuid(),
            Language = document.Language,
            ParagraphNumber = curPar.ToString(),
            Order = curPar
        };
        curPar++;

        StringBuilder paragraphText = new StringBuilder();
        int curSent = 0;

        foreach (var sentence in paragraph.s)
        {
            // A null entry carries no id, text or tokens; skip it instead of dereferencing it.
            if (sentence == null)
            {
                continue;
            }

            if (!string.IsNullOrEmpty(sentence.text) && IsNewSection(sentence.text, currentSection, out nextSection))
            {
                // Commit the section collected so far before switching to the new one.
                if (!sectionCommitted)
                {
                    AddSectionToDocument(document, currentSection);
                    sectionCommitted = true;
                }

                // The paragraph being built is attached to the new section when it completes.
                currentSection = nextSection;
            }

            Sentence currentSentence = new Sentence
            {
                Id = sentence.id,
                InternalId = Guid.NewGuid(),
                Language = document.Language,
                Text = sentence.text,
                Order = curSent,
            };
            curSent++;

            int curPos = 0;
            int recognizedTokens = 0;
            int totalTokens = 0;

            foreach (var token in sentence.token)
            {
                Token currentToken = GetParsedToken(token, document, curPos);

                currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.EuroVocEntities);
                currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.IateEntities);
                currentToken.SimilarityData.ConsolidatedTopics.AddRange(currentToken.IateDomains);

                // Punctuation is excluded from the quality ratio; every other token offers
                // two recognition slots (EuroVoc and IATE).
                if (currentToken.GeneralPos != "PUNCT")
                {
                    if (currentToken.EuroVocEntities.Count > 0)
                    {
                        recognizedTokens++;
                    }

                    if (currentToken.IateEntities.Count > 0)
                    {
                        recognizedTokens++;
                    }

                    totalTokens += 2;
                }

                currentSentence.SentenceSimilarityData.ConsolidatedTokens.AddRange(currentToken.SimilarityData.ConsolidatedTokens);
                currentSentence.SentenceSimilarityData.ConsolidatedTopics.AddRange(currentToken.SimilarityData.ConsolidatedTopics);
                currentSentence.Tokens.Add(currentToken);
                currentSentence.TokenCount++;
                curPos++;
            }

            currentSentence.SentenceSimilarityData.ConsolidatedTokens = currentSentence.SentenceSimilarityData.ConsolidatedTokens.Distinct().ToList();
            currentSentence.SentenceSimilarityData.ConsolidatedTopics = currentSentence.SentenceSimilarityData.ConsolidatedTopics.Distinct().ToList();

            if (totalTokens > 0)
            {
                currentSentence.RecognitionQuality = (double)recognizedTokens / (double)totalTokens;
            }
            else
            {
                // Nothing that could have been recognized: treat as fully recognized.
                currentSentence.RecognitionQuality = 1;
            }

            paragraphText.Append(currentSentence.Text).Append(" ");
            currentParagraph.Sentences.Add(currentSentence);
            currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTokens);
            currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTopics);
            currentParagraph.TokenCount += currentSentence.TokenCount;
        }

        currentParagraph.Text = paragraphText.ToString();
        currentSection.TextStringBuilder.AppendLine(currentParagraph.Text);
        currentSection.TokenCount += currentParagraph.TokenCount;

        // Average() throws InvalidOperationException on an empty sequence, so a paragraph
        // without sentences falls back to 1, mirroring the rule for token-less sentences.
        if (currentParagraph.Sentences.Any())
        {
            currentParagraph.RecognitionQuality = currentParagraph.Sentences.Average(s => s.RecognitionQuality);
        }
        else
        {
            currentParagraph.RecognitionQuality = 1;
        }

        currentParagraph.ParagraphSimilarityData.ConsolidatedTokens = currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.Distinct().ToList();
        currentParagraph.ParagraphSimilarityData.ConsolidatedTopics = currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.Distinct().ToList();

        currentSection.SectionSimilarityData.ConsolidatedTokens.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTokens);
        currentSection.SectionSimilarityData.ConsolidatedTopics.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTopics);
        currentSection.Paragraphs.Add(currentParagraph);
        sectionCommitted = false;
    }

    // If the last section has been modified after the last commit, add it to the document.
    if (!sectionCommitted)
    {
        AddSectionToDocument(document, currentSection);
    }

    // Guard against an empty section list (e.g. a source without paragraphs), for which
    // Average() would throw; use the same "nothing to recognize" default of 1.
    if (document.Sections.Count > 0)
    {
        document.RecognitionQuality = document.Sections.Average(s => s.RecognitionQuality);
    }
    else
    {
        document.RecognitionQuality = 1;
    }

    if (document.Sections.Count > 1)
    {
        document.IsStructured = true;
    }

    return document;
}