/// <summary>
/// Reads the RSS feed at <paramref name="rssFeedUrl"/> and, for every <c>item</c> element whose DOI is not
/// yet recorded for the project, loads the study page, collects its <c>&lt;head&gt;</c> node and creates a
/// new <see cref="BiorxivStudyReference"/>. Collected head nodes and references are saved in batches.
/// </summary>
/// <param name="rssFeedUrl">URL passed to <c>GetStream</c> to obtain the feed.</param>
/// <param name="livingSearchId">Stored on each new reference and passed to the batch-save helper.</param>
/// <param name="fileId">Passed to the batch-save helper.</param>
/// <param name="projectId">Project used for the duplicate check and stored on each new reference.</param>
/// <param name="description">Passed to the batch-save helper.</param>
/// <param name="batchSize">
/// Number of new studies per saved file. If the running count never equals this value (for example when it
/// is zero or negative), all studies end up in the single final batch.
/// </param>
/// <returns>
/// One <see cref="BiorxivHtmlFileInfoDto"/> per saved batch: each full batch as soon as it is saved, then a
/// final partial batch if any studies remain after the feed is exhausted.
/// </returns>
public async IAsyncEnumerable<BiorxivHtmlFileInfoDto> FindNewBiorxivStudiesAndSave(string rssFeedUrl, Guid livingSearchId,
    Guid fileId, Guid projectId, string description, int batchSize)
{
    var fileNumber = 1;
    var studyNumber = 0;
    var nodes = new Collection<HtmlNode>();
    var biorxivStudyReferences = new Collection<BiorxivStudyReference>();

    // Dispose the feed stream directly; the XmlReader below does not close its input by default.
    using var stream = await GetStream(rssFeedUrl);
    var web = new HtmlWeb();
    var xmlReaderSettings = new XmlReaderSettings { Async = true };
    using var reader = XmlReader.Create(stream, xmlReaderSettings);
    await reader.MoveToContentAsync();

    // NOTE(review): takes the value of the 7th attribute of the root element as the namespace of the
    // "identifier" elements; this depends on the attribute order of the feed - confirm it is stable.
    XNamespace xmlns = reader[6];

    while (!reader.EOF)
    {
        if (reader.NodeType != XmlNodeType.Element || reader.Name != "item")
        {
            await reader.ReadAsync();
            continue;
        }

        // XNode.ReadFrom consumes the whole element and advances the reader past it,
        // so no extra ReadAsync is needed on any of the paths below.
        if (!(XNode.ReadFrom(reader) is XElement el))
        {
            continue;
        }

        // The identifier value is split on ':' and the second segment is used as the DOI.
        var doi = el.Descendants(xmlns + "identifier").First().Value.Split(":")[1];
        var studyPageUrl = el.FirstAttribute.Value;

        if (await _lsUnitOfWork.BiorxivStudyReferenceRepository.ContainsReferenceWith(projectId, doi))
        {
            continue;
        }

        var studyId = Guid.NewGuid();
        biorxivStudyReferences.Add(new BiorxivStudyReference(studyId, projectId, livingSearchId, doi, studyPageUrl));

        HtmlNode headNode;
        try
        {
            var studyPage = await web.LoadFromWebAsync(studyPageUrl);
            // NOTE(review): SelectSingleNode returns null when the page has no <head>; the null is added
            // to the batch as before - confirm the batch-save helper tolerates it.
            headNode = studyPage.DocumentNode.SelectSingleNode("//head");
        }
        catch (Exception e)
        {
            Console.WriteLine(
                $"An error occured while fetching study with doi: {doi} from BioRxiv. Error message: {e.Message}");

            // Build the placeholder <head> directly. A freshly constructed HtmlDocument has no "html"
            // child, so selecting it and appending to the result would throw NullReferenceException.
            headNode = HtmlNode.CreateNode(
                $"<head>An error occured while getting study with DOI: {doi} from BioRxiv. Error message: {e.Message}</head>");
        }

        nodes.Add(headNode);
        studyNumber++;

        if (studyNumber != batchSize)
        {
            continue;
        }

        // A full batch has been collected: persist it, report it, and start the next one.
        var fileUri = await SaveBatchOfBiorxivHeadNodes(nodes, projectId, livingSearchId, fileId, description, fileNumber);
        await _lsUnitOfWork.SaveManyAsync(biorxivStudyReferences);
        nodes.Clear();
        biorxivStudyReferences.Clear();
        yield return new BiorxivHtmlFileInfoDto(fileUri, fileNumber, studyNumber);
        studyNumber = 0;
        fileNumber++;
    }

    // Flush the studies left over after the last full batch.
    if (studyNumber != 0)
    {
        var fileUri = await SaveBatchOfBiorxivHeadNodes(nodes, projectId, livingSearchId, fileId, description, fileNumber);
        await _lsUnitOfWork.SaveManyAsync(biorxivStudyReferences);
        yield return new BiorxivHtmlFileInfoDto(fileUri, fileNumber, studyNumber);
    }
}
/// <summary>
/// Submits <paramref name="searchTerm"/> through the PubMed web client, pages through the result records
/// and, for every <c>PubmedArticle</c> element whose PMID is not yet recorded for the project, tags the
/// element with a new <c>StudyId</c> and creates a <see cref="PubmedStudyReference"/>. Collected elements
/// and references are saved in batches.
/// </summary>
/// <param name="livingSearchId">Stored on each new reference and passed to the batch-save helper.</param>
/// <param name="fileId">Passed to the batch-save helper.</param>
/// <param name="description">Passed to the batch-save helper.</param>
/// <param name="projectId">Project used for the duplicate check and stored on each new reference.</param>
/// <param name="searchTerm">Search term submitted to the PubMed web client.</param>
/// <param name="batchSize">
/// Number of records requested per fetch and number of new references per saved file. Must be positive.
/// </param>
/// <returns>
/// One <see cref="PubmedXmlFileInfoDto"/> per saved batch: each full batch as soon as it is saved, then a
/// final partial batch if any new references remain after all pages are read.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="batchSize"/> is zero or negative. Because this is an async iterator, the exception is
/// raised when enumeration starts, not when the method is called.
/// </exception>
public async IAsyncEnumerable<PubmedXmlFileInfoDto> FindNewPubmedStudiesAndSave(Guid livingSearchId, Guid fileId,
    string description, Guid projectId, string searchTerm, int batchSize)
{
    // A zero batch size would divide by zero below, and a negative one would keep the paging loop
    // from ever reaching the result count. Fail fast before any search is submitted.
    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be greater than zero.");
    }

    var queryWebResult = await _pubmedWebClient.SubmitSearch(searchTerm);
    var fileNumber = 1;
    var numberOfNewReference = 0;
    // NOTE(review): this is computed from the raw result count with the division as written; if Count is
    // an integer type it rounds down, and already-known records are not excluded - confirm what
    // consumers expect from this value.
    var totalFileNumber = queryWebResult.Count / batchSize;
    var nodes = new Collection<XElement>();
    var pubmedStudyReferences = new Collection<PubmedStudyReference>();
    var xmlReaderSettings = new XmlReaderSettings { Async = true };

    for (var retStart = 0; retStart < queryWebResult.Count; retStart += batchSize)
    {
        var xmlString = await _pubmedWebClient.GetRecordsXmlString(queryWebResult.WebEnv, queryWebResult.QueryKey,
            batchSize, retStart);

        // NOTE(review): if xmlString is a System.String, XmlReader.Create(string, ...) interprets it as a
        // URI rather than as XML content - confirm what GetRecordsXmlString returns.
        using var reader = XmlReader.Create(xmlString, xmlReaderSettings);
        await reader.MoveToContentAsync();

        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element || reader.Name != "PubmedArticle")
            {
                await reader.ReadAsync();
                continue;
            }

            // XNode.ReadFrom consumes the whole element and advances the reader past it,
            // so no extra ReadAsync is needed on any of the paths below.
            if (!(XNode.ReadFrom(reader) is XElement el))
            {
                continue;
            }

            // NOTE(review): First() throws InvalidOperationException when the article has no PMID
            // (here) or no ELocationID (below) descendant - confirm both are always present.
            var pubmedId = el.Descendants("PMID").First().Value;
            if (await _lsUnitOfWork.PubmedStudyReferenceRepository.ContainsReferenceWith(projectId, pubmedId))
            {
                continue;
            }

            var studyId = Guid.NewGuid();
            // The generated id is embedded in the element that is later saved with the batch.
            el.Add(new XElement("StudyId", studyId.ToString()));
            pubmedStudyReferences.Add(new PubmedStudyReference(studyId, projectId, livingSearchId,
                el.Descendants("ELocationID").First().Value, pubmedId));
            nodes.Add(el);
            numberOfNewReference++;

            if (numberOfNewReference != batchSize)
            {
                continue;
            }

            // A full batch has been collected: persist it, report it, and start the next one.
            var fileUri = await SaveBatchOfPubmedNodes(nodes, projectId, livingSearchId, fileId, description, fileNumber);
            nodes.Clear();
            await _lsUnitOfWork.SaveManyAsync(pubmedStudyReferences);
            pubmedStudyReferences.Clear();
            yield return new PubmedXmlFileInfoDto(fileUri, queryWebResult.WebEnv, queryWebResult.Count, fileNumber,
                numberOfNewReference, totalFileNumber, queryWebResult.QueryKey);
            numberOfNewReference = 0;
            fileNumber++;
        }
    }

    // Flush the references left over after the last full batch.
    if (numberOfNewReference != 0)
    {
        var fileUri = await SaveBatchOfPubmedNodes(nodes, projectId, livingSearchId, fileId, description, fileNumber);
        await _lsUnitOfWork.SaveManyAsync(pubmedStudyReferences);
        yield return new PubmedXmlFileInfoDto(fileUri, queryWebResult.WebEnv, queryWebResult.Count, fileNumber,
            numberOfNewReference, totalFileNumber, queryWebResult.QueryKey);
    }
}