/// <summary>
/// Builds a text corpus from the "*.txt" files found in each project's "source" or "target"
/// sub-directory below the configured text file directory.
/// </summary>
/// <param name="projects">Ids of the projects whose text files are loaded.</param>
/// <param name="type">Selects the "source" or "target" sub-directory of every project.</param>
/// <returns>
/// An already-completed task holding a <see cref="DictionaryTextCorpus"/> with one
/// <see cref="TextFileText"/> per file found.
/// </returns>
/// <exception cref="System.ComponentModel.InvalidEnumArgumentException">
/// <paramref name="type"/> is neither Source nor Target.
/// </exception>
public Task<ITextCorpus> CreateAsync(IEnumerable<string> projects, TextCorpusType type)
{
    // The sub-directory depends only on the corpus type, so it is resolved once, before any
    // project is visited. An unrecognised value is rejected here; otherwise a null directory
    // name would reach Path.Combine and fail with an unrelated ArgumentNullException.
    string dir;
    switch (type)
    {
        case TextCorpusType.Source:
            dir = "source";
            break;
        case TextCorpusType.Target:
            dir = "target";
            break;
        default:
            throw new System.ComponentModel.InvalidEnumArgumentException(
                nameof(type), (int)type, typeof(TextCorpusType));
    }

    var wordTokenizer = new LatinWordTokenizer();
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        string projectDir = Path.Combine(_textFileDir, projectId, dir);
        foreach (string file in Directory.EnumerateFiles(projectDir, "*.txt"))
        {
            // The text id is the project id joined with the file name (extension removed).
            string textId = $"{projectId}_{Path.GetFileNameWithoutExtension(file)}";
            texts.Add(new TextFileText(wordTokenizer, textId, file));
        }
    }

    return Task.FromResult<ITextCorpus>(new DictionaryTextCorpus(texts));
}
/// <summary>
/// Loads the stored text data document of every chapter of every text belonging to the given
/// projects and wraps each one in an <see cref="SFScriptureText"/>.
/// </summary>
/// <param name="projects">Ids of the projects whose texts are loaded.</param>
/// <param name="type">Whether the source or the target text data is loaded.</param>
/// <returns>One text per chapter, in the order the projects, texts and chapters are enumerated.</returns>
/// <exception cref="InvalidEnumArgumentException">
/// <paramref name="type"/> is neither Source nor Target.
/// </exception>
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    // The mapping does not depend on the project, so it is done once, before any query runs.
    TextType textType;
    switch (type)
    {
        case TextCorpusType.Source:
            textType = TextType.Source;
            break;
        case TextCorpusType.Target:
            textType = TextType.Target;
            break;
        default:
            // The rejected value is a TextCorpusType, so that is the enum the exception must name.
            throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
    }

    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase database = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
    IMongoCollection<BsonDocument> textDataColl = database.GetCollection<BsonDocument>(
        SFDataAccessConstants.TextDataCollectionName);

    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        List<TextEntity> textList = await _texts.Query()
            .Where(t => t.ProjectRef == projectId)
            .ToListAsync();
        foreach (TextEntity text in textList)
        {
            foreach (Chapter chapter in text.Chapters)
            {
                FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter
                    .Eq("_id", TextEntity.GetTextDataId(text.Id, chapter.Number, textType));
                // FirstAsync (not FirstOrDefaultAsync): no null check is made on the result, so a
                // chapter without a stored document is not skipped here.
                BsonDocument doc = await textDataColl.Find(filter).FirstAsync();
                texts.Add(new SFScriptureText(wordTokenizer, projectId, text.Id, chapter.Number, doc));
            }
        }
    }

    return texts;
}
/// <summary>
/// Collects an <see cref="SFScriptureText"/> for every chapter document of the given projects
/// that exists and carries an array-valued "ops" field.
/// </summary>
/// <param name="projects">Ids of the projects to read.</param>
/// <param name="type">
/// Source reads the documents of the project referenced by the translate configuration;
/// Target reads the documents of the project itself.
/// </param>
/// <returns>The texts that could be loaded; chapters without a usable document are left out.</returns>
/// <exception cref="DataNotFoundException">
/// A project has no (or a blank) source project reference, regardless of <paramref name="type"/>.
/// </exception>
/// <exception cref="InvalidEnumArgumentException">
/// <paramref name="type"/> is neither Source nor Target.
/// </exception>
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer tokenizer = new LatinWordTokenizer();
    IMongoCollection<BsonDocument> chapterDocs = _mongoClient
        .GetDatabase(_dataAccessOptions.Value.MongoDatabaseName)
        .GetCollection<BsonDocument>(_realtimeService.GetCollectionName<TextData>());

    var result = new List<IText>();
    foreach (string projectId in projects)
    {
        var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);

        // The source reference is required even when the target corpus is requested.
        if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
        {
            throw new DataNotFoundException("The source project reference is missing");
        }

        // Decide which project's documents are read.
        string corpusProjectId;
        switch (type)
        {
            case TextCorpusType.Source:
                corpusProjectId = project.TranslateConfig.Source.ProjectRef;
                break;
            case TextCorpusType.Target:
                corpusProjectId = projectId;
                break;
            default:
                throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
        }

        // Only texts flagged as having a source are considered.
        foreach (TextInfo book in project.Texts.Where(t => t.HasSource))
        {
            foreach (Chapter chapter in book.Chapters)
            {
                string docId = TextData.GetTextDocId(corpusProjectId, book.BookNum, chapter.Number);
                FilterDefinition<BsonDocument> byId = Builders<BsonDocument>.Filter.Eq("_id", docId);
                BsonDocument chapterDoc = await chapterDocs.Find(byId).FirstOrDefaultAsync();

                if (chapterDoc == null)
                {
                    continue;
                }

                if (!chapterDoc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                {
                    continue;
                }

                // The text is registered under the requested project id, not the corpus project id.
                result.Add(new SFScriptureText(tokenizer, projectId, book.BookNum, chapter.Number, chapterDoc));
            }
        }
    }

    return result;
}
/// <summary>
/// Loads the source or target documents of every non-deleted "translate" entry of the given
/// projects and wraps each one in an <see cref="XForgeScriptureText"/> or an
/// <see cref="XForgeRichText"/>, depending on the project's "isTranslationDataScripture" setting.
/// </summary>
/// <param name="projects">Ids of the projects to read. Ids are parsed as Mongo object ids.</param>
/// <param name="type">Whether the ":source" or the ":target" documents are loaded.</param>
/// <returns>
/// The loaded texts. Projects that are unknown to the project repository or that have no
/// document in the "projects" collection are skipped.
/// </returns>
/// <exception cref="System.ComponentModel.InvalidEnumArgumentException">
/// <paramref name="type"/> is neither Source nor Target (raised once a known project is reached).
/// </exception>
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase sfDatabase = _mongoClient.GetDatabase("scriptureforge");
    IMongoDatabase realtimeDatabase = _mongoClient.GetDatabase("realtime");
    IMongoCollection<BsonDocument> projectsColl = sfDatabase.GetCollection<BsonDocument>("projects");

    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        Project project = await _projectRepo.GetAsync(projectId);
        if (project == null)
        {
            continue;
        }

        string segmentType;
        string suffix;
        switch (type)
        {
            case TextCorpusType.Source:
                suffix = "source";
                segmentType = project.SourceSegmentType;
                break;
            case TextCorpusType.Target:
                suffix = "target";
                segmentType = project.TargetSegmentType;
                break;
            default:
                // Without this, a null suffix would produce the document id "<id>:" and the
                // failure would only show up later as a lookup error unrelated to the argument.
                throw new System.ComponentModel.InvalidEnumArgumentException(
                    nameof(type), (int)type, typeof(TextCorpusType));
        }

        // A project without a segment type gets no segment tokenizer (null is passed on).
        StringTokenizer segmentTokenizer = null;
        if (segmentType != null)
        {
            segmentTokenizer = WebApiUtils.CreateSegmentTokenizer(segmentType);
        }

        FilterDefinition<BsonDocument> projectFilter = Builders<BsonDocument>.Filter
            .Eq("_id", ObjectId.Parse(projectId));
        BsonDocument projectDoc = await projectsColl.Find(projectFilter).FirstOrDefaultAsync();
        if (projectDoc == null)
        {
            continue;
        }

        // The same "sf_<projectCode>" name is used both as a collection in the realtime
        // database and as the name of the project's own database.
        var code = "sf_" + (string)projectDoc["projectCode"];
        var isScripture = (bool)projectDoc["config"]["isTranslationDataScripture"];
        IMongoCollection<BsonDocument> projectColl = realtimeDatabase.GetCollection<BsonDocument>(code);
        IMongoDatabase projectDatabase = _mongoClient.GetDatabase(code);
        IMongoCollection<BsonDocument> translateColl = projectDatabase.GetCollection<BsonDocument>("translate");

        FilterDefinition<BsonDocument> notDeletedFilter = Builders<BsonDocument>.Filter.Eq("isDeleted", false);
        using (IAsyncCursor<BsonDocument> cursor = await translateColl.Find(notDeletedFilter).ToCursorAsync())
        {
            while (await cursor.MoveNextAsync())
            {
                foreach (BsonDocument docInfo in cursor.Current)
                {
                    var id = (ObjectId)docInfo["_id"];
                    FilterDefinition<BsonDocument> docFilter = Builders<BsonDocument>.Filter
                        .Eq("_id", $"{id}:{suffix}");
                    // FirstAsync (not FirstOrDefaultAsync): the result is used without a null check.
                    BsonDocument doc = await projectColl.Find(docFilter).FirstAsync();
                    if (isScripture)
                    {
                        texts.Add(new XForgeScriptureText(wordTokenizer, project.Id, doc));
                    }
                    else
                    {
                        texts.Add(new XForgeRichText(segmentTokenizer, wordTokenizer, project.Id, doc));
                    }
                }
            }
        }
    }

    return texts;
}
/// <summary>
/// Creates a text corpus containing the texts of the given projects.
/// </summary>
/// <param name="projects">Ids of the projects whose texts make up the corpus.</param>
/// <param name="type">The corpus type, passed through unchanged to the text loader.</param>
/// <returns>A <see cref="DictionaryTextCorpus"/> built from the loaded texts.</returns>
public async Task<ITextCorpus> CreateAsync(IEnumerable<string> projects, TextCorpusType type)
{
    IReadOnlyList<IText> loadedTexts = await CreateTextsAsync(projects, type);
    return new DictionaryTextCorpus(loadedTexts);
}
/// <summary>
/// Collects the scripture texts of the given projects and, where a "TermRenderings.xml" file
/// exists for a project, one additional <see cref="SFBiblicalTermsText"/> built from it.
/// </summary>
/// <param name="projects">Ids of the projects to read.</param>
/// <param name="type">
/// Source uses the project id and Paratext id of the configured translation source;
/// Target uses those of the project itself.
/// </param>
/// <returns>
/// The loaded texts. Chapters whose document is missing or has no array-valued "ops" field
/// are left out.
/// </returns>
/// <exception cref="DataNotFoundException">
/// A project has no (or a blank) source project reference, regardless of <paramref name="type"/>.
/// </exception>
/// <exception cref="InvalidEnumArgumentException">
/// <paramref name="type"/> is neither Source nor Target.
/// </exception>
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer tokenizer = new LatinWordTokenizer();
    IMongoCollection<BsonDocument> chapterDocs = _mongoClient
        .GetDatabase(_dataAccessOptions.Value.MongoDatabaseName)
        .GetCollection<BsonDocument>(_realtimeService.GetCollectionName<TextData>());

    var result = new List<IText>();
    foreach (string projectId in projects)
    {
        var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);

        // The source reference is required even when the target corpus is requested.
        if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
        {
            throw new DataNotFoundException("The source project reference is missing");
        }

        // Pick the project whose documents are read and the Paratext id used for the sync folder.
        string corpusProjectId;
        string paratextId;
        if (type == TextCorpusType.Source)
        {
            corpusProjectId = project.TranslateConfig.Source.ProjectRef;
            paratextId = project.TranslateConfig.Source.ParatextId;
        }
        else if (type == TextCorpusType.Target)
        {
            corpusProjectId = projectId;
            paratextId = project.ParatextId;
        }
        else
        {
            throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
        }

        // Only texts flagged as having a source are considered.
        foreach (TextInfo book in project.Texts.Where(t => t.HasSource))
        {
            foreach (Chapter chapter in book.Chapters)
            {
                string docId = TextData.GetTextDocId(corpusProjectId, book.BookNum, chapter.Number);
                FilterDefinition<BsonDocument> byId = Builders<BsonDocument>.Filter.Eq("_id", docId);
                BsonDocument chapterDoc = await chapterDocs.Find(byId).FirstOrDefaultAsync();

                if (chapterDoc == null)
                {
                    continue;
                }

                if (!chapterDoc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                {
                    continue;
                }

                // The text is registered under the requested project id, not the corpus project id.
                result.Add(new SFScriptureText(tokenizer, projectId, book.BookNum, chapter.Number, chapterDoc));
            }
        }

        // The renderings file is looked up in the "target" folder for both corpus types.
        string termRenderingsPath = Path.Combine(
            _siteOptions.Value.SiteDir, "sync", paratextId, "target", "TermRenderings.xml");
        if (!_fileSystemService.FileExists(termRenderingsPath))
        {
            continue;
        }

        using (var stream = _fileSystemService.OpenFile(termRenderingsPath, FileMode.Open))
        {
            XDocument renderings = await XDocument.LoadAsync(stream, LoadOptions.None, CancellationToken.None);
            result.Add(new SFBiblicalTermsText(tokenizer, projectId, renderings));
        }
    }

    return result;
}