/// <summary>
/// Collects the authors extracted from every entry in <c>CatalogData</c> and returns
/// one author per distinct (FirstName, LastName) pair.
/// </summary>
/// <returns>
/// The de-duplicated list of authors; for each name pair the first extracted occurrence is kept.
/// </returns>
public List<Author> ImportAuthorsList()
{
    var collected = new List<Author>();

    foreach (var row in CatalogData)
    {
        try
        {
            collected.AddRange(AuthorExtractor.Extract(row.Author));
        }
        catch (ExtractorException e)
        {
            // A row whose author text cannot be extracted is logged and skipped;
            // the remaining rows are still processed.
            _log.Error($"Author Extract Error: [{e.Text}]");
        }
    }

    // Group by the name pair and keep the first author of each group.
    var groupedByName = collected.GroupBy(author => new { author.FirstName, author.LastName });
    var distinctAuthors = groupedByName.Select(sameName => sameName.First());

    return distinctAuthors.ToList();
}
// Extracting from "???" is expected to throw an ExtractorException.
public void Extract_QuestionsMarks_ShouldReturnExtractException()
{
    // Arrange
    const string unparsableInput = "???";

    // Act
    Action extraction = () => AuthorExtractor.Extract(unparsableInput);

    // Assert
    // NOTE(review): the string below is the FluentAssertions "because" reason that is
    // shown when the assertion fails; it is not compared with the exception message.
    // Confirm that is the intent (a message check would need WithMessage).
    extraction.Should().Throw<ExtractorException>(because: "Cannot extract data from text");
}
// A null author text is expected to produce an empty authors list.
public void Extract_Null_ShouldReturnEmptyAuthorsList()
{
    // Arrange
    const string input = null;

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().BeEmpty();
}
// The text "inni" on its own is expected to produce an empty authors list.
public void Extract_Inni_ShouldReturnEmptyAuthorsList()
{
    // Arrange
    const string input = "inni";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().BeEmpty();
}
// The text "'-" (apostrophe followed by a dash) is expected to produce an empty authors list.
public void Extract_PauseWithApostrophe_ShouldReturnEmptyAuthorsList()
{
    // Arrange
    const string input = "'-";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().BeEmpty();
}
/// <summary>
/// Initializes a new <see cref="Unfluffer"/>, creating the HTML parser and one
/// default-constructed instance of each extractor it holds.
/// </summary>
public Unfluffer()
{
    // Parser
    this._htmlParser = new HtmlParser();

    // Extractors
    this._languageExtractor = new LanguageExtractor();
    this._titleExtractor = new TitleExtractor();
    this._descriptionExtractor = new DescriptionExtractor();
    this._favIconExtractor = new FavIconExtractor();
    this._imageExtractor = new ImageExtractor();
    this._authorExtractor = new AuthorExtractor();
    this._canonicalExtractor = new CanonicalExtractor();
}
/// <summary>
/// Builds one <see cref="Book"/> per entry in <c>CatalogData</c>, filling each field through
/// its dedicated extractor and passing the result to the <c>ImportBookValidator</c> checks
/// together with the previously imported reference lists.
/// </summary>
/// <returns>The list of books, in the order the catalog entries were enumerated.</returns>
public List<Book> ImportBooksList()
{
    // Reference lists handed to the validator checks below.
    var knownAuthors = ImportAuthorsList();
    var importedSeriesInfos = ImportSeriesListInfo();
    var knownPublishingHouses = ImportPublishingHousesList();
    var knownStoragePlaces = ImportStoragePlacesList();
    var knownCategories = ImportCategoriesList();

    // One series per distinct series name; entries with a null or empty name are dropped.
    var knownSeries = importedSeriesInfos
        .GroupBy(info => info.SeriesName)
        .Select(sameName => sameName.First())
        .Where(info => !string.IsNullOrEmpty(info.SeriesName))
        .Select(info => info.ToSeries())
        .ToList();

    var result = new List<Book>();

    foreach (var row in CatalogData)
    {
        // A book carries at most one category here; a null extraction result yields an empty list.
        Category extractedCategory = CategoryExtractor.Extract(row.Category);
        var rowCategories = new List<Category>();
        if (extractedCategory != null)
        {
            rowCategories.Add(extractedCategory);
        }

        var seriesInfo = SeriesInfoExtractor.Extract(row.Series);

        // NOTE(review): ImportAuthorsList catches ExtractorException from AuthorExtractor.Extract
        // and skips the row; the same call below is not guarded, so such an exception
        // propagates out of this method. Confirm that this difference is intended.
        var importedBook = new Book
        {
            Id = Guid.NewGuid(),
            Title = TitleExtractor.Extract(row.Title),
            Authors = AuthorExtractor.Extract(row.Author),
            Series = seriesInfo?.ToSeries(),
            PublishingHouse = PublishingHouseExtractor.Extract(row.PublishingHouse),
            PublishmentYear = YearExtractor.Extract(row.Year),
            ISBN = IsbnExtractor.Extract(row.ISBN),
            Language = LanguageExtractor.Extract(row.Language),
            StoragePlace = StoragePlaceExtractor.Extract(row.StoragePlace),
            Comment = CommentExtractor.Extract(row.Comment),
            Categories = rowCategories,
            VolumeNumber = seriesInfo?.VolumeNumber
        };

        // Run the validator checks against the reference lists
        // (what each check does is defined in ImportBookValidator).
        ImportBookValidator.CheckAuthors(knownAuthors, importedBook.Authors);
        ImportBookValidator.CheckSeries(knownSeries, importedBook.Series);
        ImportBookValidator.CheckPublishingHouse(knownPublishingHouses, importedBook.PublishingHouse);
        ImportBookValidator.CheckStoragePlace(knownStoragePlaces, importedBook.StoragePlace);
        ImportBookValidator.CheckCategory(knownCategories, importedBook.Categories);

        result.Add(importedBook);
    }

    return result;
}
// A single word containing an apostrophe is expected to become the last name
// of one author whose first name is empty.
public void Extract_OnlyOneNameWithApostrophe_ShouldReturnAuthorListWithAuthorWithoutName()
{
    // Arrange
    const string singleName = "O'Rely";

    // Act
    var result = AuthorExtractor.Extract(singleName);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().BeEmpty();
    onlyAuthor.LastName.Should().Be(singleName);
}
// "First Last" is expected to yield exactly one author with those two name parts.
public void Extract_SimpleName_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "Andrzej";
    const string lastName = "Sapkowski";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Two given names followed by a surname are expected to yield one author whose
// first name holds both given names.
public void Extract_NameWithSecondName_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "Liliana Elena";
    const string lastName = "Wroska";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Names written with Cyrillic letters are expected to be extracted unchanged as one author.
public void Extract_NameWithRussianCharacters_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя";
    const string lastName = "аБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Hyphenated first and last names are expected to be kept intact in a single author.
public void Extract_AllNamesWithPause_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "Zygmunt-Karol";
    const string lastName = "Zeydler-Zborowski";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Names containing Polish diacritics are expected to be extracted unchanged as one author.
public void Extract_NameWithPolishCharacters_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "Ąęóśłżźćń";
    const string lastName = "Eąóśłżźćń";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// An initial ("B.") in place of the first name is expected to be kept as the first name.
public void Extract_NameWithOneInitial_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "B.";
    const string lastName = "Kwiatek";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Apostrophes inside both the first and the last name are expected to be preserved.
public void Extract_AllNamesWithApostrophe_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "A'manda";
    const string lastName = "O'rely";
    var input = $"{firstName} {lastName}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// A trailing " i inni" after the name is expected to be ignored, leaving one author.
public void Extract_AndOthers_ShouldReturnAuthorsListWithOneElement()
{
    // Arrange
    const string firstName = "Anna";
    const string lastName = "Kwiatek";
    const string extraText = " i inni";
    var input = $"{firstName} {lastName}{extraText}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(1);

    var onlyAuthor = result[0];
    onlyAuthor.Id.Should().NotBeEmpty();
    onlyAuthor.FirstName.Should().Be(firstName);
    onlyAuthor.LastName.Should().Be(lastName);
}
// Two names separated by "; " are expected to yield two authors in the same order.
public void Extract_TwoAuthors_ShouldReturnAuthorsListWithTwoElement()
{
    // Arrange
    const string firstNameFirstAuthor = "Anne";
    const string lastNameFirstAuthor = "Plichota";
    const string firstNameSecondAuthor = "Cendrine";
    const string lastNameSecondAuthor = "Wolf";
    var input = $"{firstNameFirstAuthor} {lastNameFirstAuthor}; {firstNameSecondAuthor} {lastNameSecondAuthor}";

    // Act
    var result = AuthorExtractor.Extract(input);

    // Assert
    result.Should().HaveCount(2);

    var first = result[0];
    var second = result[1];

    first.Id.Should().NotBeEmpty();
    first.FirstName.Should().Be(firstNameFirstAuthor);
    first.LastName.Should().Be(lastNameFirstAuthor);

    second.Id.Should().NotBeEmpty();
    second.FirstName.Should().Be(firstNameSecondAuthor);
    second.LastName.Should().Be(lastNameSecondAuthor);
}
/// <summary>
/// Builds a <see cref="CdaDocument"/> from a CDA XML document by running one extractor
/// per document section, each configured with the XPaths registered for the document's template ID.
/// </summary>
/// <param name="cdaDocument">The XML document to extract from; must not be null.</param>
/// <returns>A <see cref="CdaDocument"/> populated with the result of every extractor.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="cdaDocument"/> is null, or when the XPath provider returns
/// null for the document's template ID.
/// </exception>
public CdaDocument Extract(XmlDocument cdaDocument)
{
    if (cdaDocument == null)
    {
        throw new ArgumentException("'cdaDocument' cannot be null");
    }

    var wrappedDocument = new CdaXmlDocument(cdaDocument);

    // The XPath set is looked up by template ID; a null result means the template is not supported.
    IDictionary<string, string> xPaths = _documentXPathProvider.GetDocumentXPaths(wrappedDocument.TemplateId);
    if (xPaths == null)
    {
        throw new ArgumentException("Document with template ID '" + wrappedDocument.TemplateId + "' not supported");
    }

    // Each extractor is constructed and run immediately, in the same sequence as before.
    var extractedAuthor = new AuthorExtractor(xPaths).Extract(wrappedDocument);
    var extractedMedications = new MedicationsExtractor(xPaths).Extract(wrappedDocument);
    var extractedAdverseReactions = new AdverseReactionsExtractor(xPaths).Extract(wrappedDocument);
    var extractedMetadata = new DocumentMetadataExtractor(xPaths).Extract(wrappedDocument);
    var extractedMedicalHistory = new MedicalHistoryExtractor(xPaths).Extract(wrappedDocument);
    var extractedSubjectOfCare = new SubjectOfCareExtractor(xPaths).Extract(wrappedDocument);
    var extractedImmunisations = new ImmunisationsExtractor(xPaths).Extract(wrappedDocument);
    var extractedConsumerNote = new ConsumerNoteExtractor(xPaths).Extract(wrappedDocument);
    var extractedAdvanceCareInformation = new AdvanceCareInformationExtractor(xPaths).Extract(wrappedDocument);
    var extractedBenefitItems = new PharmaceuticalBenefitItemExtractor(xPaths).Extract(wrappedDocument);
    var extractedDiagnosticImaging = new DiagnosticImagingExtractor(xPaths).Extract(wrappedDocument);
    var extractedPathology = new PathologyExtractor(xPaths).Extract(wrappedDocument);

    return new CdaDocument
    {
        Author = extractedAuthor,
        Medications = extractedMedications,
        AdverseReactions = extractedAdverseReactions,
        MedicalHistoryItems = extractedMedicalHistory,
        DocumentMetadata = extractedMetadata,
        SubjectOfCare = extractedSubjectOfCare,
        ImmunisationItems = extractedImmunisations,
        ConsumerNote = extractedConsumerNote,
        AdvanceCareInformation = extractedAdvanceCareInformation,
        PharmaceuticalBenefitItems = extractedBenefitItems,
        Pathology = extractedPathology,
        DiagnosticImaging = extractedDiagnosticImaging
    };
}