/// <summary>
/// Rebuilds a <see cref="MarcellDocument"/> from the fields stored in a Lucene document.
/// </summary>
/// <param name="source">Lucene document to read the stored fields from.</param>
/// <returns>
/// A new document populated from the stored fields. Single-valued string fields are
/// <c>null</c> when the field has no value; "InternalId" must be present and parseable as a Guid.
/// </returns>
public static MarcellDocument ToDocument(this LuceneDocument source)
        {
            //First stored value of a field, or null when the field has no values
            string FirstValue(string fieldName) => source.GetValues(fieldName)?.FirstOrDefault();

            var document = new MarcellDocument
            {
                Id = FirstValue("Id"),
                //Unlike the optional fields, a missing internal id is not tolerated here
                InternalId = Guid.Parse(source.GetValues("InternalId").First()),
                ApprovalDate = source.GetDate("ApprovalDate"),
                DocumentDate = source.GetDate("DocumentDate"),
                EffectiveDate = source.GetDate("EffectiveDate"),
                DocumentSimilarityData = new SimilarityData
                {
                    ConsolidatedTokens = source.GetStringList("DocumentToken"),
                    ConsolidatedTopics = source.GetStringList("DocumentTopic"),
                },
                TokenCount = source.GetInt("TokenCount"),
                DocumentType = FirstValue("DocumentType"),
                OriginalType = FirstValue("OriginalType"),
                Issuer = FirstValue("Issuer"),
                Language = FirstValue("Language"),
                Url = FirstValue("Url"),
                RecognitionQuality = source.GetDouble("RecognitionQuality"),
                IsStructured = source.GetBool("IsStructured"),
                FileName = FirstValue("FileName"),
            };

            return document;
        }
        /// <summary>
        /// Converts a <see cref="MarcellDocument"/> into a Lucene document carrying its metadata,
        /// dates, consolidated document tokens/topics and recognition quality.
        /// </summary>
        /// <param name="sourceDocument">Document to convert.</param>
        /// <returns>
        /// A new Lucene document. "Id" falls back to the internal id when the document has no id;
        /// "FileName" is only added when the document has a file name.
        /// </returns>
        public static LuceneDocument ToLucene(this MarcellDocument sourceDocument)
        {
            LuceneDocument result = new LuceneDocument();

            //Add basic document data
            string internalId = sourceDocument.InternalId.ToString("N");
            result.AddStringField("Id", sourceDocument.Id ?? internalId, Field.Store.YES);
            result.AddStringField("InternalId", internalId, Field.Store.YES);
            result.AddDateField("ApprovalDate", sourceDocument.ApprovalDate);
            result.AddDateField("DocumentDate", sourceDocument.DocumentDate);
            result.AddDateField("EffectiveDate", sourceDocument.EffectiveDate);
            result.AddStringList("DocumentToken", sourceDocument.DocumentSimilarityData.ConsolidatedTokens);
            result.AddStringList("DocumentTopic", sourceDocument.DocumentSimilarityData.ConsolidatedTopics);
            result.AddInt32Field("TokenCount", sourceDocument.TokenCount, Field.Store.YES);
            result.AddTextField("DocumentType", sourceDocument.DocumentType ?? "", Field.Store.YES);
            result.AddTextField("OriginalType", sourceDocument.OriginalType ?? "", Field.Store.YES);
            result.AddTextField("Issuer", sourceDocument.Issuer ?? "", Field.Store.YES);
            //Language is coalesced like the other optional strings; Lucene.NET fields do not accept a null value
            result.AddStringField("Language", sourceDocument.Language ?? "", Field.Store.YES);
            result.AddStringField("Url", sourceDocument.Url ?? "", Field.Store.YES);
            result.AddScoredDoubleField("RecognitionQuality", sourceDocument.RecognitionQuality);
            result.AddBoolField("IsStructured", sourceDocument.IsStructured);

            if (sourceDocument.FileName != null)
            {
                result.AddStringField("FileName", sourceDocument.FileName, Field.Store.YES);
            }

            return result;
        }
        /// <summary>
        /// Converts a <see cref="Section"/> into a Lucene document carrying its identifiers,
        /// document- and section-level tokens/topics, text and a reference to its parent document.
        /// </summary>
        /// <param name="sourceSection">Section to convert.</param>
        /// <param name="parentDocument">Owning document; when <c>null</c> no "ParentDocumentId" field is added.</param>
        /// <returns>A new Lucene document. "Id" falls back to the internal id when the section has no id.</returns>
        public static LuceneDocument ToLucene(this Section sourceSection, MarcellDocument parentDocument)
        {
            LuceneDocument result = new LuceneDocument();

            //Add basic document data
            string internalId = sourceSection.InternalId.ToString("N");
            result.AddStringField("Id", sourceSection.Id ?? internalId, Field.Store.YES);
            result.AddStringField("InternalId", internalId, Field.Store.YES);
            result.AddStringList("DocumentToken", sourceSection.DocumentSimilarityData.ConsolidatedTokens);
            result.AddStringList("DocumentTopic", sourceSection.DocumentSimilarityData.ConsolidatedTopics);
            result.AddStringList("SectionToken", sourceSection.SectionSimilarityData.ConsolidatedTokens);
            result.AddStringList("SectionTopic", sourceSection.SectionSimilarityData.ConsolidatedTopics);
            result.AddInt32Field("TokenCount", sourceSection.TokenCount, Field.Store.YES);
            //Language is coalesced like the text below; Lucene.NET fields do not accept a null value
            result.AddStringField("Language", sourceSection.Language ?? "", Field.Store.YES);
            result.AddScoredDoubleField("RecognitionQuality", sourceSection.RecognitionQuality);

            result.AddStringField("Type", sourceSection.Type.ToString(), Field.Store.YES);

            result.AddTextField("Text", sourceSection.Text ?? "", Field.Store.YES);

            //Add reference to parent document
            if (parentDocument != null)
            {
                result.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
            }

            return result;
        }
Example #4
0
        /// <summary>
        /// Builds a <see cref="Token"/> from a parsed source token.
        /// </summary>
        /// <param name="token">Source token whose annotation values are copied or parsed.</param>
        /// <param name="document">Document being built; only its language is read.</param>
        /// <param name="curPos">Value stored as the token's <c>Order</c>.</param>
        /// <returns>A new token with a freshly generated internal id.</returns>
        private Token GetParsedToken(textDocPSToken token, MarcellDocument document, int curPos)
        {
            //The IATE list is parsed first; the list returned through the out parameter becomes IateDomains
            var iateEntities = ParseMarcellList(token.marcell_iate, out List<string> iateDomains, "IATE");

            var parsed = new Token();

            parsed.Deprel = token.deprel;
            parsed.Deps = token.deps;
            //The out list is discarded for the EuroVoc entries
            parsed.EuroVocEntities = ParseMarcellList(token.marcell_eurovoc, out _, "EV");
            parsed.Features = ParseMarcellFeatures(token.feats);
            parsed.Form = token.form;
            parsed.GeneralPos = token.upos;
            parsed.HeadId = token.head;
            parsed.IateEntities = iateEntities;
            parsed.IateDomains = iateDomains;
            parsed.Id = token.id.ToString();
            parsed.InternalId = Guid.NewGuid();
            parsed.Language = document.Language;
            parsed.LanguagePos = token.xpos;
            parsed.Lemma = token.lemma;
            parsed.Misc = token.misc;
            parsed.NE = token.marcell_ne;
            parsed.NP = token.marcell_np;
            parsed.Order = curPos;

            return parsed;
        }
Example #5
0
        /// <summary>
        /// Finalizes a section and appends it to the document: de-duplicates the section's
        /// consolidated tokens and topics, averages the recognition quality of its paragraphs,
        /// then adds the section's token count, tokens and topics to the document totals.
        /// </summary>
        /// <param name="document">Document that receives the section.</param>
        /// <param name="section">Section to finalize; must contain at least one paragraph, otherwise the average throws.</param>
        private void AddSectionToDocument(MarcellDocument document, Section section)
        {
            //Collapse duplicate section-level entries before they are propagated upwards
            var uniqueTokens = section.SectionSimilarityData.ConsolidatedTokens.Distinct().ToList();
            section.SectionSimilarityData.ConsolidatedTokens = uniqueTokens;

            var uniqueTopics = section.SectionSimilarityData.ConsolidatedTopics.Distinct().ToList();
            section.SectionSimilarityData.ConsolidatedTopics = uniqueTopics;

            //Section quality is the mean of its paragraphs' quality
            section.RecognitionQuality = section.Paragraphs.Average(paragraph => paragraph.RecognitionQuality);

            //Roll the section up into the document
            document.TokenCount = document.TokenCount + section.TokenCount;
            document.DocumentSimilarityData.ConsolidatedTokens.AddRange(uniqueTokens);
            document.DocumentSimilarityData.ConsolidatedTopics.AddRange(uniqueTopics);
            document.Sections.Add(section);
        }
        /// <summary>
        /// Converts a <see cref="Sentence"/> into a Lucene document carrying its identifiers,
        /// the tokens/topics of every level (document, section, paragraph, sentence), its text,
        /// references to its parents and the EuroVoc/IATE entities of its tokens.
        /// </summary>
        /// <param name="sourceSentence">Sentence to convert.</param>
        /// <param name="parentDocument">Owning document; when <c>null</c> no "ParentDocumentId" field is added.</param>
        /// <param name="parentSection">Owning section; when <c>null</c> no "ParentSectionId" field is added.</param>
        /// <param name="parentParagraph">Owning paragraph; when <c>null</c> no "ParentParagraphId" field is added.</param>
        /// <returns>A new Lucene document. "Id" falls back to the internal id when the sentence has no id.</returns>
        public static LuceneDocument ToLucene(this Sentence sourceSentence, MarcellDocument parentDocument, Section parentSection, Paragraph parentParagraph)
        {
            LuceneDocument result = new LuceneDocument();

            //Add basic document data
            string internalId = sourceSentence.InternalId.ToString("N");
            result.AddStringField("Id", sourceSentence.Id ?? internalId, Field.Store.YES);
            result.AddStringField("InternalId", internalId, Field.Store.YES);
            result.AddStringList("DocumentToken", sourceSentence.DocumentSimilarityData.ConsolidatedTokens);
            result.AddStringList("DocumentTopic", sourceSentence.DocumentSimilarityData.ConsolidatedTopics);
            result.AddStringList("SectionToken", sourceSentence.SectionSimilarityData.ConsolidatedTokens);
            result.AddStringList("SectionTopic", sourceSentence.SectionSimilarityData.ConsolidatedTopics);
            result.AddStringList("ParagraphToken", sourceSentence.ParagraphSimilarityData.ConsolidatedTokens);
            result.AddStringList("ParagraphTopic", sourceSentence.ParagraphSimilarityData.ConsolidatedTopics);
            result.AddStringList("SentenceToken", sourceSentence.SentenceSimilarityData.ConsolidatedTokens);
            result.AddStringList("SentenceTopic", sourceSentence.SentenceSimilarityData.ConsolidatedTopics);
            result.AddInt32Field("TokenCount", sourceSentence.TokenCount, Field.Store.YES);
            //Language is coalesced like the text below; Lucene.NET fields do not accept a null value
            result.AddStringField("Language", sourceSentence.Language ?? "", Field.Store.YES);
            result.AddScoredDoubleField("RecognitionQuality", sourceSentence.RecognitionQuality);
            result.AddInt32Field("Order", sourceSentence.Order, Field.Store.YES);

            string sentenceText = sourceSentence.Text ?? "";

            result.AddTextField("Text", sentenceText, Field.Store.YES);

            //Add reference to parent document
            if (parentDocument != null)
            {
                result.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
            }
            //Add reference to parent section
            if (parentSection != null)
            {
                result.AddStringField("ParentSectionId", parentSection.InternalId.ToString("N"), Field.Store.YES);
            }
            //Add reference to parent paragraph
            if (parentParagraph != null)
            {
                result.AddStringField("ParentParagraphId", parentParagraph.InternalId.ToString("N"), Field.Store.YES);
            }

            //Entities of the contained tokens are passed with Field.Store.NO, unlike the fields above
            result.AddStringList("ContainedTokenEV", sourceSentence.Tokens.SelectMany(t => t.EuroVocEntities), Field.Store.NO);
            result.AddStringList("ContainedTokenIATE", sourceSentence.Tokens.SelectMany(t => t.IateEntities), Field.Store.NO);

            return result;
        }
Example #7
0
 /// <summary>
 /// Second parsing pass: copies similarity data from parents down to their children, so that
 /// every section, paragraph and sentence also carries the data of all of its ancestors.
 /// </summary>
 /// <param name="document">Document whose sections, paragraphs and sentences are updated in place.</param>
 public void ParsePass2(MarcellDocument document)
 {
     //The document-level data is the same for every descendant
     var documentData = document.DocumentSimilarityData;

     foreach (var section in document.Sections)
     {
         var sectionData = section.SectionSimilarityData;
         section.DocumentSimilarityData = documentData;

         foreach (var paragraph in section.Paragraphs)
         {
             var paragraphData = paragraph.ParagraphSimilarityData;
             paragraph.DocumentSimilarityData = documentData;
             paragraph.SectionSimilarityData = sectionData;

             foreach (var sentence in paragraph.Sentences)
             {
                 sentence.DocumentSimilarityData = documentData;
                 sentence.SectionSimilarityData = sectionData;
                 sentence.ParagraphSimilarityData = paragraphData;
             }
         }
     }
 }
        /// <summary>
        /// Converts a <see cref="Paragraph"/> into a Lucene document carrying its identifiers,
        /// the tokens/topics of the document, section and paragraph levels, its numbering, its text
        /// and references to its parent document and section.
        /// </summary>
        /// <param name="sourceParagraph">Paragraph to convert.</param>
        /// <param name="parentDocument">Owning document; when <c>null</c> no "ParentDocumentId" field is added.</param>
        /// <param name="parentSection">Owning section; when <c>null</c> no "ParentSectionId" field is added.</param>
        /// <returns>A new Lucene document. "Id" falls back to the internal id when the paragraph has no id.</returns>
        public static LuceneDocument ToLucene(this Paragraph sourceParagraph, MarcellDocument parentDocument, Section parentSection)
        {
            LuceneDocument result = new LuceneDocument();

            //Add basic document data
            string internalId = sourceParagraph.InternalId.ToString("N");
            result.AddStringField("Id", sourceParagraph.Id ?? internalId, Field.Store.YES);
            result.AddStringField("InternalId", internalId, Field.Store.YES);
            result.AddStringList("DocumentToken", sourceParagraph.DocumentSimilarityData.ConsolidatedTokens);
            result.AddStringList("DocumentTopic", sourceParagraph.DocumentSimilarityData.ConsolidatedTopics);
            result.AddStringList("SectionToken", sourceParagraph.SectionSimilarityData.ConsolidatedTokens);
            result.AddStringList("SectionTopic", sourceParagraph.SectionSimilarityData.ConsolidatedTopics);
            result.AddStringList("ParagraphToken", sourceParagraph.ParagraphSimilarityData.ConsolidatedTokens);
            result.AddStringList("ParagraphTopic", sourceParagraph.ParagraphSimilarityData.ConsolidatedTopics);
            result.AddInt32Field("TokenCount", sourceParagraph.TokenCount, Field.Store.YES);
            //Language is coalesced like the numbering fields below; Lucene.NET fields do not accept a null value
            result.AddStringField("Language", sourceParagraph.Language ?? "", Field.Store.YES);
            result.AddScoredDoubleField("RecognitionQuality", sourceParagraph.RecognitionQuality);
            result.AddInt32Field("Order", sourceParagraph.Order, Field.Store.YES);

            result.AddStringField("ParagraphType", sourceParagraph.ParagraphType.ToString(), Field.Store.YES);
            result.AddStringField("ParagraphNumber", sourceParagraph.ParagraphNumber ?? "", Field.Store.YES);
            result.AddStringField("PointNumber", sourceParagraph.PointNumber ?? "", Field.Store.YES);

            string paragraphText = sourceParagraph.Text ?? "";

            result.AddTextField("Text", paragraphText, Field.Store.YES);

            //Add reference to parent document
            if (parentDocument != null)
            {
                result.AddStringField("ParentDocumentId", parentDocument.InternalId.ToString("N"), Field.Store.YES);
            }
            //Add reference to parent section
            if (parentSection != null)
            {
                result.AddStringField("ParentSectionId", parentSection.InternalId.ToString("N"), Field.Store.YES);
            }

            return result;
        }
Example #9
0
        /// <summary>
        /// First parsing pass: walks the paragraphs, sentences and tokens of a CoNLL source document
        /// and builds the corresponding <see cref="MarcellDocument"/>, propagating consolidated
        /// tokens and topics upwards from tokens to sentences, paragraphs and sections.
        /// </summary>
        /// <param name="sourceDocument">Source document whose <c>doc.p</c> paragraphs are processed.</param>
        /// <returns>
        /// The populated document. <c>IsStructured</c> is set to <c>true</c> when more than one
        /// section was added. Null sentence entries are skipped, and a paragraph without sentences
        /// or a document without sections gets a recognition quality of 1.
        /// </returns>
        public MarcellDocument ParsePass1(CoNLLDocument sourceDocument)
        {
            //Initialize the document with basic information, new Guid as the internal Id and as an unstructured document
            MarcellDocument document = GetBasicDocument(sourceDocument);

            Section currentSection = new Section()
            {
                InternalId = Guid.NewGuid(),
                Language   = document.Language
            };
            Section nextSection     = null;
            bool    sectionCommited = true;
            int     curPar          = 0;

            //Go through the document and combine all sentences into paragraphs, propagating keywords upwards from children to parents
            foreach (var paragraph in sourceDocument.doc.p)
            {
                Paragraph currentParagraph = new Paragraph()
                {
                    Id              = paragraph.id,
                    InternalId      = Guid.NewGuid(),
                    Language        = document.Language,
                    ParagraphNumber = curPar.ToString(),
                    Order           = curPar
                };
                curPar++;

                StringBuilder paragraphText = new StringBuilder();
                int           curSent       = 0;
                foreach (var sentence in paragraph.s)
                {
                    //A null entry has nothing to contribute; skipping it avoids dereferencing it below
                    if (sentence == null)
                    {
                        continue;
                    }

                    if (!string.IsNullOrEmpty(sentence.text) && IsNewSection(sentence.text, currentSection, out nextSection))
                    {
                        //Commit the section collected so far before switching to the newly detected one
                        if (!sectionCommited)
                        {
                            AddSectionToDocument(document, currentSection);
                            sectionCommited = true;
                        }
                        currentSection = nextSection;
                    }

                    Sentence currentSentence = new Sentence
                    {
                        Id         = sentence.id,
                        InternalId = Guid.NewGuid(),
                        Language   = document.Language,
                        Text       = sentence.text,
                        Order      = curSent,
                    };
                    curSent++;
                    int curPos           = 0;
                    int recognizedTokens = 0;
                    int totalTokens      = 0;
                    foreach (var token in sentence.token)
                    {
                        Token currentToken = GetParsedToken(token, document, curPos);

                        currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.EuroVocEntities);
                        currentToken.SimilarityData.ConsolidatedTokens.AddRange(currentToken.IateEntities);
                        currentToken.SimilarityData.ConsolidatedTopics.AddRange(currentToken.IateDomains);

                        //Every non-punctuation token can be recognized twice (EuroVoc and IATE), hence the += 2
                        if (currentToken.GeneralPos != "PUNCT")
                        {
                            if (currentToken.EuroVocEntities.Count > 0)
                            {
                                recognizedTokens++;
                            }
                            if (currentToken.IateEntities.Count > 0)
                            {
                                recognizedTokens++;
                            }
                            totalTokens += 2;
                        }

                        currentSentence.SentenceSimilarityData.ConsolidatedTokens.AddRange(currentToken.SimilarityData.ConsolidatedTokens);
                        currentSentence.SentenceSimilarityData.ConsolidatedTopics.AddRange(currentToken.SimilarityData.ConsolidatedTopics);

                        currentSentence.Tokens.Add(currentToken);
                        currentSentence.TokenCount++;

                        curPos++;
                    }

                    currentSentence.SentenceSimilarityData.ConsolidatedTokens = currentSentence.SentenceSimilarityData.ConsolidatedTokens.Distinct().ToList();
                    currentSentence.SentenceSimilarityData.ConsolidatedTopics = currentSentence.SentenceSimilarityData.ConsolidatedTopics.Distinct().ToList();

                    if (totalTokens > 0)
                    {
                        double sentenceQuality = (double)recognizedTokens / (double)totalTokens;
                        currentSentence.RecognitionQuality = sentenceQuality;
                    }
                    else
                    {
                        //Nothing that could have been recognized counts as fully recognized
                        currentSentence.RecognitionQuality = 1;
                    }

                    paragraphText.Append(currentSentence.Text).Append(" ");
                    currentParagraph.Sentences.Add(currentSentence);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTokens);
                    currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.AddRange(currentSentence.SentenceSimilarityData.ConsolidatedTopics);
                    currentParagraph.TokenCount += currentSentence.TokenCount;
                }
                currentParagraph.Text = paragraphText.ToString();
                currentSection.TextStringBuilder.AppendLine(currentParagraph.Text);
                currentSection.TokenCount += currentParagraph.TokenCount;

                //Average throws on an empty sequence; a paragraph without sentences gets the same
                //"nothing to recognize" quality of 1 that a sentence without countable tokens gets
                currentParagraph.RecognitionQuality = currentParagraph.Sentences.Any()
                    ? currentParagraph.Sentences.Average(s => s.RecognitionQuality)
                    : 1.0;

                currentParagraph.ParagraphSimilarityData.ConsolidatedTokens = currentParagraph.ParagraphSimilarityData.ConsolidatedTokens.Distinct().ToList();
                currentParagraph.ParagraphSimilarityData.ConsolidatedTopics = currentParagraph.ParagraphSimilarityData.ConsolidatedTopics.Distinct().ToList();

                currentSection.SectionSimilarityData.ConsolidatedTokens.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTokens);
                currentSection.SectionSimilarityData.ConsolidatedTopics.AddRange(currentParagraph.ParagraphSimilarityData.ConsolidatedTopics);

                currentSection.Paragraphs.Add(currentParagraph);
                sectionCommited = false;
            }

            //If the last section has been modified after last commit, add it to the document
            if (!sectionCommited)
            {
                AddSectionToDocument(document, currentSection);
            }

            //A source without paragraphs yields no sections; guard the average the same way as above
            document.RecognitionQuality = document.Sections.Any()
                ? document.Sections.Average(s => s.RecognitionQuality)
                : 1.0;

            //NOTE(review): unlike the sentence, paragraph and section levels, the document-level
            //consolidated lists are not de-duplicated in this method - confirm whether that is intended
            if (document.Sections.Count > 1)
            {
                document.IsStructured = true;
            }
            return document;
        }