/// <summary>
/// Reconstructs a <see cref="MarcellDocument"/> from the values held by a Lucene document.
/// </summary>
/// <param name="source">Lucene document to read the field values from.</param>
/// <returns>A new <see cref="MarcellDocument"/> filled from the fields of <paramref name="source"/>.</returns>
/// <remarks>
/// "InternalId" is mandatory: it is read with <c>First()</c> and handed to <c>Guid.Parse</c>, so a missing or
/// malformed value makes this method throw. The other single-valued string fields are read with
/// <c>FirstOrDefault()</c>.
/// </remarks>
public static MarcellDocument ToDocument(this LuceneDocument source)
{
    //Reads the first value of a field, or the default when the field carries no value
    string FirstValue(string fieldName) => source.GetValues(fieldName)?.FirstOrDefault();

    return new MarcellDocument
    {
        Id = FirstValue("Id"),
        InternalId = Guid.Parse(source.GetValues("InternalId").First()),
        ApprovalDate = source.GetDate("ApprovalDate"),
        DocumentDate = source.GetDate("DocumentDate"),
        EffectiveDate = source.GetDate("EffectiveDate"),
        DocumentSimilarityData = new SimilarityData
        {
            ConsolidatedTokens = source.GetStringList("DocumentToken"),
            ConsolidatedTopics = source.GetStringList("DocumentTopic"),
        },
        TokenCount = source.GetInt("TokenCount"),
        DocumentType = FirstValue("DocumentType"),
        OriginalType = FirstValue("OriginalType"),
        Issuer = FirstValue("Issuer"),
        Language = FirstValue("Language"),
        Url = FirstValue("Url"),
        RecognitionQuality = source.GetDouble("RecognitionQuality"),
        IsStructured = source.GetBool("IsStructured"),
        FileName = FirstValue("FileName"),
    };
}
/// <summary>
/// Maps a <see cref="MarcellDocument"/> onto a new Lucene document.
/// </summary>
/// <param name="sourceDocument">Document whose metadata and document-level similarity data are written.</param>
/// <returns>The newly created Lucene document.</returns>
/// <remarks>
/// "Id" falls back to the internal id (formatted with "N") when the document has no id of its own.
/// DocumentType, OriginalType, Issuer and Url are written as empty strings when null, and "FileName" is
/// only added when a file name is present.
/// NOTE(review): Language is the only string passed through without a null fallback - confirm it can never be null here.
/// </remarks>
public static LuceneDocument ToLucene(this MarcellDocument sourceDocument)
{
    var luceneDocument = new LuceneDocument();
    string internalId = sourceDocument.InternalId.ToString("N");
    var similarity = sourceDocument.DocumentSimilarityData;

    //Identity
    luceneDocument.AddStringField("Id", sourceDocument.Id ?? internalId, Field.Store.YES);
    luceneDocument.AddStringField("InternalId", internalId, Field.Store.YES);

    //Dates
    luceneDocument.AddDateField("ApprovalDate", sourceDocument.ApprovalDate);
    luceneDocument.AddDateField("DocumentDate", sourceDocument.DocumentDate);
    luceneDocument.AddDateField("EffectiveDate", sourceDocument.EffectiveDate);

    //Document level keywords and topics
    luceneDocument.AddStringList("DocumentToken", similarity.ConsolidatedTokens);
    luceneDocument.AddStringList("DocumentTopic", similarity.ConsolidatedTopics);

    //Remaining metadata
    luceneDocument.AddInt32Field("TokenCount", sourceDocument.TokenCount, Field.Store.YES);
    luceneDocument.AddTextField("DocumentType", sourceDocument.DocumentType ?? "", Field.Store.YES);
    luceneDocument.AddTextField("OriginalType", sourceDocument.OriginalType ?? "", Field.Store.YES);
    luceneDocument.AddTextField("Issuer", sourceDocument.Issuer ?? "", Field.Store.YES);
    luceneDocument.AddStringField("Language", sourceDocument.Language, Field.Store.YES);
    luceneDocument.AddStringField("Url", sourceDocument.Url ?? "", Field.Store.YES);
    luceneDocument.AddScoredDoubleField("RecognitionQuality", sourceDocument.RecognitionQuality);
    luceneDocument.AddBoolField("IsStructured", sourceDocument.IsStructured);

    //The file name is optional and is left out entirely when missing
    var fileName = sourceDocument.FileName;
    if (fileName != null)
    {
        luceneDocument.AddStringField("FileName", fileName, Field.Store.YES);
    }

    return luceneDocument;
}
/// <summary>
/// Maps a <see cref="Section"/> onto a new Lucene document.
/// </summary>
/// <param name="sourceSection">Section whose data, including document and section level similarity data, is written.</param>
/// <param name="parentDocument">Owning document; when not null its internal id is written as "ParentDocumentId".</param>
/// <returns>The newly created Lucene document.</returns>
/// <remarks>
/// "Id" falls back to the internal id (formatted with "N") when the section has no id of its own, and a
/// null text is written as an empty string.
/// </remarks>
public static LuceneDocument ToLucene(this Section sourceSection, MarcellDocument parentDocument)
{
    var luceneDocument = new LuceneDocument();
    string internalId = sourceSection.InternalId.ToString("N");
    var documentData = sourceSection.DocumentSimilarityData;
    var sectionData = sourceSection.SectionSimilarityData;

    //Identity
    luceneDocument.AddStringField("Id", sourceSection.Id ?? internalId, Field.Store.YES);
    luceneDocument.AddStringField("InternalId", internalId, Field.Store.YES);

    //Keywords and topics of the document and of the section itself
    luceneDocument.AddStringList("DocumentToken", documentData.ConsolidatedTokens);
    luceneDocument.AddStringList("DocumentTopic", documentData.ConsolidatedTopics);
    luceneDocument.AddStringList("SectionToken", sectionData.ConsolidatedTokens);
    luceneDocument.AddStringList("SectionTopic", sectionData.ConsolidatedTopics);

    //Section metadata and text
    luceneDocument.AddInt32Field("TokenCount", sourceSection.TokenCount, Field.Store.YES);
    luceneDocument.AddStringField("Language", sourceSection.Language, Field.Store.YES);
    luceneDocument.AddScoredDoubleField("RecognitionQuality", sourceSection.RecognitionQuality);
    luceneDocument.AddStringField("Type", sourceSection.Type.ToString(), Field.Store.YES);
    luceneDocument.AddTextField("Text", sourceSection.Text ?? "", Field.Store.YES);

    //Add reference to parent document
    if (parentDocument != null)
    {
        luceneDocument.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
    }

    return luceneDocument;
}
/// <summary>
/// Builds a <see cref="Token"/> from a source token.
/// </summary>
/// <param name="token">Source token whose annotations are copied or parsed.</param>
/// <param name="document">Document being built; only its language is read.</param>
/// <param name="curPos">Value stored as the token's <c>Order</c>.</param>
/// <returns>A new token with a freshly generated internal id.</returns>
private Token GetParsedToken(textDocPSToken token, MarcellDocument document, int curPos)
{
    //Parse the annotation lists up front; the IATE call also yields the domains through its out parameter
    var iateEntities = ParseMarcellList(token.marcell_iate, out List<string> iateDomains, "IATE");
    var euroVocEntities = ParseMarcellList(token.marcell_eurovoc, out _, "EV");
    var features = ParseMarcellFeatures(token.feats);

    var parsedToken = new Token
    {
        Deprel = token.deprel,
        Deps = token.deps,
        EuroVocEntities = euroVocEntities,
        Features = features,
        Form = token.form,
        GeneralPos = token.upos,
        HeadId = token.head,
        IateEntities = iateEntities,
        IateDomains = iateDomains,
        Id = token.id.ToString(),
        InternalId = Guid.NewGuid(),
        Language = document.Language,
        LanguagePos = token.xpos,
        Lemma = token.lemma,
        Misc = token.misc,
        NE = token.marcell_ne,
        NP = token.marcell_np,
        Order = curPos
    };

    return parsedToken;
}
/// <summary>
/// Finalizes a section and appends it to the document.
/// </summary>
/// <param name="document">Document that receives the section, its token count and its keywords and topics.</param>
/// <param name="section">Section to finalize; its paragraphs are averaged, so it needs at least one paragraph.</param>
/// <remarks>
/// NOTE(review): the document level lists only grow through AddRange and are not de-duplicated here, unlike the
/// section level lists - confirm whether that is intentional.
/// </remarks>
private void AddSectionToDocument(MarcellDocument document, Section section)
{
    //Drop repeated keywords and topics collected for the section
    section.SectionSimilarityData.ConsolidatedTokens =
        section.SectionSimilarityData.ConsolidatedTokens.Distinct().ToList();
    section.SectionSimilarityData.ConsolidatedTopics =
        section.SectionSimilarityData.ConsolidatedTopics.Distinct().ToList();

    //The quality of a section is the mean quality of its paragraphs
    var averageQuality = section.Paragraphs.Average(paragraph => paragraph.RecognitionQuality);
    section.RecognitionQuality = averageQuality;

    //Propagate the section totals upwards to the document
    document.TokenCount += section.TokenCount;
    document.DocumentSimilarityData.ConsolidatedTokens.AddRange(section.SectionSimilarityData.ConsolidatedTokens);
    document.DocumentSimilarityData.ConsolidatedTopics.AddRange(section.SectionSimilarityData.ConsolidatedTopics);

    document.Sections.Add(section);
}
/// <summary>
/// Maps a <see cref="Sentence"/> onto a new Lucene document.
/// </summary>
/// <param name="sourceSentence">Sentence whose data, similarity data of all four levels and token entities are written.</param>
/// <param name="parentDocument">Owning document; when not null its internal id is written as "ParentDocumentId".</param>
/// <param name="parentSection">Owning section; when not null its internal id is written as "ParentSectionId".</param>
/// <param name="parentParagraph">Owning paragraph; when not null its internal id is written as "ParentParagraphId".</param>
/// <returns>The newly created Lucene document.</returns>
/// <remarks>
/// "Id" falls back to the internal id (formatted with "N") when the sentence has no id of its own, and a
/// null text is written as an empty string. The entity lists gathered from the tokens are added with
/// <c>Field.Store.NO</c>.
/// </remarks>
public static LuceneDocument ToLucene(this Sentence sourceSentence, MarcellDocument parentDocument, Section parentSection, Paragraph parentParagraph)
{
    var luceneDocument = new LuceneDocument();
    string internalId = sourceSentence.InternalId.ToString("N");
    var documentData = sourceSentence.DocumentSimilarityData;
    var sectionData = sourceSentence.SectionSimilarityData;
    var paragraphData = sourceSentence.ParagraphSimilarityData;
    var sentenceData = sourceSentence.SentenceSimilarityData;

    //Identity
    luceneDocument.AddStringField("Id", sourceSentence.Id ?? internalId, Field.Store.YES);
    luceneDocument.AddStringField("InternalId", internalId, Field.Store.YES);

    //Keywords and topics, from the document level down to the sentence itself
    luceneDocument.AddStringList("DocumentToken", documentData.ConsolidatedTokens);
    luceneDocument.AddStringList("DocumentTopic", documentData.ConsolidatedTopics);
    luceneDocument.AddStringList("SectionToken", sectionData.ConsolidatedTokens);
    luceneDocument.AddStringList("SectionTopic", sectionData.ConsolidatedTopics);
    luceneDocument.AddStringList("ParagraphToken", paragraphData.ConsolidatedTokens);
    luceneDocument.AddStringList("ParagraphTopic", paragraphData.ConsolidatedTopics);
    luceneDocument.AddStringList("SentenceToken", sentenceData.ConsolidatedTokens);
    luceneDocument.AddStringList("SentenceTopic", sentenceData.ConsolidatedTopics);

    //Sentence metadata and text
    luceneDocument.AddInt32Field("TokenCount", sourceSentence.TokenCount, Field.Store.YES);
    luceneDocument.AddStringField("Language", sourceSentence.Language, Field.Store.YES);
    luceneDocument.AddScoredDoubleField("RecognitionQuality", sourceSentence.RecognitionQuality);
    luceneDocument.AddInt32Field("Order", sourceSentence.Order, Field.Store.YES);
    luceneDocument.AddTextField("Text", sourceSentence.Text ?? "", Field.Store.YES);

    //Add reference to parent document
    if (parentDocument != null)
    {
        luceneDocument.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
    }

    //Add reference to parent section
    if (parentSection != null)
    {
        luceneDocument.AddStringField("ParentSectionId", parentSection.InternalId.ToString("N"), Field.Store.YES);
    }

    //Add reference to parent paragraph
    if (parentParagraph != null)
    {
        luceneDocument.AddStringField("ParentParagraphId", parentParagraph.InternalId.ToString("N"), Field.Store.YES);
    }

    //Entities found on the tokens of the sentence
    var euroVocEntities = sourceSentence.Tokens.SelectMany(t => t.EuroVocEntities);
    luceneDocument.AddStringList("ContainedTokenEV", euroVocEntities, Field.Store.NO);
    var iateEntities = sourceSentence.Tokens.SelectMany(t => t.IateEntities);
    luceneDocument.AddStringList("ContainedTokenIATE", iateEntities, Field.Store.NO);

    return luceneDocument;
}
/// <summary>
/// Second parsing pass: hands the similarity data of every parent level down to its children.
/// </summary>
/// <param name="document">Document whose sections, paragraphs and sentences are updated in place.</param>
/// <remarks>
/// Sections receive the document data, paragraphs the document and section data, and sentences the
/// document, section and paragraph data. The values are assigned as they are; no lists are rebuilt here.
/// </remarks>
public void ParsePass2(MarcellDocument document)
{
    var documentData = document.DocumentSimilarityData;

    foreach (var section in document.Sections)
    {
        section.DocumentSimilarityData = documentData;
        var sectionData = section.SectionSimilarityData;

        foreach (var paragraph in section.Paragraphs)
        {
            paragraph.DocumentSimilarityData = documentData;
            paragraph.SectionSimilarityData = sectionData;
            var paragraphData = paragraph.ParagraphSimilarityData;

            foreach (var sentence in paragraph.Sentences)
            {
                sentence.DocumentSimilarityData = documentData;
                sentence.SectionSimilarityData = sectionData;
                sentence.ParagraphSimilarityData = paragraphData;
            }
        }
    }
}
/// <summary>
/// Maps a <see cref="Paragraph"/> onto a new Lucene document.
/// </summary>
/// <param name="sourceParagraph">Paragraph whose data and document, section and paragraph level similarity data are written.</param>
/// <param name="parentDocument">Owning document; when not null its internal id is written as "ParentDocumentId".</param>
/// <param name="parentSection">Owning section; when not null its internal id is written as "ParentSectionId".</param>
/// <returns>The newly created Lucene document.</returns>
/// <remarks>
/// "Id" falls back to the internal id (formatted with "N") when the paragraph has no id of its own.
/// ParagraphNumber, PointNumber and Text are written as empty strings when null.
/// </remarks>
public static LuceneDocument ToLucene(this Paragraph sourceParagraph, MarcellDocument parentDocument, Section parentSection)
{
    var luceneDocument = new LuceneDocument();
    string internalId = sourceParagraph.InternalId.ToString("N");
    var documentData = sourceParagraph.DocumentSimilarityData;
    var sectionData = sourceParagraph.SectionSimilarityData;
    var paragraphData = sourceParagraph.ParagraphSimilarityData;

    //Identity
    luceneDocument.AddStringField("Id", sourceParagraph.Id ?? internalId, Field.Store.YES);
    luceneDocument.AddStringField("InternalId", internalId, Field.Store.YES);

    //Keywords and topics, from the document level down to the paragraph itself
    luceneDocument.AddStringList("DocumentToken", documentData.ConsolidatedTokens);
    luceneDocument.AddStringList("DocumentTopic", documentData.ConsolidatedTopics);
    luceneDocument.AddStringList("SectionToken", sectionData.ConsolidatedTokens);
    luceneDocument.AddStringList("SectionTopic", sectionData.ConsolidatedTopics);
    luceneDocument.AddStringList("ParagraphToken", paragraphData.ConsolidatedTokens);
    luceneDocument.AddStringList("ParagraphTopic", paragraphData.ConsolidatedTopics);

    //Paragraph metadata and text
    luceneDocument.AddInt32Field("TokenCount", sourceParagraph.TokenCount, Field.Store.YES);
    luceneDocument.AddStringField("Language", sourceParagraph.Language, Field.Store.YES);
    luceneDocument.AddScoredDoubleField("RecognitionQuality", sourceParagraph.RecognitionQuality);
    luceneDocument.AddInt32Field("Order", sourceParagraph.Order, Field.Store.YES);
    luceneDocument.AddStringField("ParagraphType", sourceParagraph.ParagraphType.ToString(), Field.Store.YES);
    luceneDocument.AddStringField("ParagraphNumber", sourceParagraph.ParagraphNumber ?? "", Field.Store.YES);
    luceneDocument.AddStringField("PointNumber", sourceParagraph.PointNumber ?? "", Field.Store.YES);
    luceneDocument.AddTextField("Text", sourceParagraph.Text ?? "", Field.Store.YES);

    //Add reference to parent document
    if (parentDocument != null)
    {
        luceneDocument.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
    }

    //Add reference to parent section
    if (parentSection != null)
    {
        luceneDocument.AddStringField("ParentSectionId", parentSection.InternalId.ToString("N"), Field.Store.YES);
    }

    return luceneDocument;
}
/// <summary>
/// First parsing pass: turns a CoNLL source document into a <see cref="MarcellDocument"/> made of sections,
/// paragraphs, sentences and tokens, propagating keywords and topics upwards from children to parents.
/// </summary>
/// <param name="sourceDocument">Source document whose paragraphs (<c>doc.p</c>) are walked.</param>
/// <returns>
/// The populated document. <c>IsStructured</c> is set to true when more than one section was found.
/// </returns>
/// <remarks>
/// Recognition quality of a sentence is the share of recognized annotations among its non-punctuation
/// tokens, where every such token counts twice (once for EuroVoc, once for IATE). Paragraph, section and
/// document quality are averages of their children. An aggregate with nothing to average gets quality 1,
/// the same value already used for sentences without scorable tokens, instead of failing in
/// <c>Average</c>. Null sentence entries are skipped, and missing paragraph, sentence or token collections
/// are treated as empty.
/// </remarks>
public MarcellDocument ParsePass1(CoNLLDocument sourceDocument)
{
    //Initialize the document with basic information, new Guid as the internal Id and as an unstructured document
    MarcellDocument document = GetBasicDocument(sourceDocument);
    Section currentSection = new Section() { InternalId = Guid.NewGuid(), Language = document.Language };
    Section nextSection = null;
    //True while currentSection has nothing that still needs to be added to the document
    bool sectionCommitted = true;
    int curPar = 0;

    //Go through the document and combine all sentences into paragraphs, propagating keywords upwards from children to parents
    var sourceParagraphs = sourceDocument.doc.p;
    if (sourceParagraphs != null)
    {
        foreach (var paragraph in sourceParagraphs)
        {
            Paragraph currentParagraph = new Paragraph()
            {
                Id = paragraph.id,
                InternalId = Guid.NewGuid(),
                Language = document.Language,
                ParagraphNumber = curPar.ToString(),
                Order = curPar
            };
            curPar++;

            StringBuilder paragraphText = new StringBuilder();
            int curSent = 0;

            //A paragraph without a sentence collection is kept as an empty paragraph
            if (paragraph.s != null)
            {
                foreach (var sentence in paragraph.s)
                {
                    //Skip null entries: everything below dereferences the sentence
                    if (sentence == null)
                    {
                        continue;
                    }

                    //A sentence that opens a new section closes the section collected so far
                    if (!string.IsNullOrEmpty(sentence.text) && IsNewSection(sentence.text, currentSection, out nextSection))
                    {
                        if (!sectionCommitted)
                        {
                            AddSectionToDocument(document, currentSection);
                            sectionCommitted = true;
                        }

                        currentSection = nextSection;
                    }

                    Sentence currentSentence = new Sentence
                    {
                        Id = sentence.id,
                        InternalId = Guid.NewGuid(),
                        Language = document.Language,
                        Text = sentence.text,
                        Order = curSent,
                    };
                    curSent++;

                    int curPos = 0;
                    int recognizedTokens = 0;
                    int totalTokens = 0;

                    if (sentence.token != null)
                    {
                        foreach (var token in sentence.token)
                        {
                            Token currentToken = GetParsedToken(token, document, curPos);
                            currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.EuroVocEntities);
                            currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.IateEntities);
                            currentToken.SimilarityData.ConsolidatedTopics.AddRange(currentToken.IateDomains);

                            //Punctuation does not take part in the quality score
                            if (currentToken.GeneralPos != "PUNCT")
                            {
                                if (currentToken.EuroVocEntities.Count > 0)
                                {
                                    recognizedTokens++;
                                }

                                if (currentToken.IateEntities.Count > 0)
                                {
                                    recognizedTokens++;
                                }

                                //Two possible recognitions (EuroVoc and IATE) per token
                                totalTokens += 2;
                            }

                            currentSentence.SentenceSimilarityData.ConsolidatedTokens.AddRange(currentToken.SimilarityData.ConsolidatedTokens);
                            currentSentence.SentenceSimilarityData.ConsolidatedTopics.AddRange(currentToken.SimilarityData.ConsolidatedTopics);
                            currentSentence.Tokens.Add(currentToken);
                            currentSentence.TokenCount++;
                            curPos++;
                        }
                    }

                    currentSentence.SentenceSimilarityData.ConsolidatedTokens = currentSentence.SentenceSimilarityData.ConsolidatedTokens.Distinct().ToList();
                    currentSentence.SentenceSimilarityData.ConsolidatedTopics = currentSentence.SentenceSimilarityData.ConsolidatedTopics.Distinct().ToList();

                    if (totalTokens > 0)
                    {
                        currentSentence.RecognitionQuality = (double)recognizedTokens / (double)totalTokens;
                    }
                    else
                    {
                        //Nothing to recognize counts as fully recognized
                        currentSentence.RecognitionQuality = 1;
                    }

                    paragraphText.Append(currentSentence.Text).Append(" ");
                    currentParagraph.Sentences.Add(currentSentence);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTokens);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTopics);
                    currentParagraph.TokenCount += currentSentence.TokenCount;
                }
            }

            currentParagraph.Text = paragraphText.ToString();
            currentSection.TextStringBuilder.AppendLine(currentParagraph.Text);
            currentSection.TokenCount += currentParagraph.TokenCount;

            //Average throws on an empty sequence, so a paragraph without sentences gets the neutral quality 1
            currentParagraph.RecognitionQuality = currentParagraph.Sentences.Any()
                ? currentParagraph.Sentences.Average(s => s.RecognitionQuality)
                : 1;

            currentParagraph.ParagraphSimilarityData.ConsolidatedTokens = currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.Distinct().ToList();
            currentParagraph.ParagraphSimilarityData.ConsolidatedTopics = currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.Distinct().ToList();
            currentSection.SectionSimilarityData.ConsolidatedTokens.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTokens);
            currentSection.SectionSimilarityData.ConsolidatedTopics.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTopics);
            currentSection.Paragraphs.Add(currentParagraph);
            sectionCommitted = false;
        }
    }

    //If the last section has been modified after last commit, add it to the document
    if (!sectionCommitted)
    {
        AddSectionToDocument(document, currentSection);
    }

    //Average throws on an empty sequence, so a document without sections gets the neutral quality 1
    document.RecognitionQuality = document.Sections.Count > 0
        ? document.Sections.Average(s => s.RecognitionQuality)
        : 1;

    if (document.Sections.Count > 1)
    {
        document.IsStructured = true;
    }

    return document;
}